My Profile Photo

ML ideabook

A notebook for Coursera Machine Learning course ideas


The final week. There is not much to explain: simply print the plots. Check it here. I used proc freq for the frequency distribution plot, since the code doesn’t make sense if proc univariate is used.

It seems that most people have at least a moderate income and that the distribution is roughly normal, but the plot shows no statistically significant relationship.

/* Read-only library that holds the course-provided data sets. */
libname mydata "/courses/d1406ae5ba27fe300 "
	access=readonly;

/* Personal library for results that should persist. Point it at your own
   directory, because the WORK library is temporary. */
libname worklib '~/worklib';

/* Custom display formats so that coded values print as readable labels. */
proc format;
	/* Fix: this VALUE statement was missing its terminating semicolon, which
	   ran it into the following VALUE statement.
	   NOTE(review): only code 1 is labelled here, so every other code prints
	   as its raw number. Add more labels once confirmed against the codebook. */
	value AMP	1='Yes';

	/* Labels for the income category codes 0 through 17. */
	value inc	0=' $0 (No personal income)'
				1= '$1 to $4,999'
				2= '$5,000 to $7,999'
				3= '$8,000 to $9,999'
				4= '$10,000 to $12,999'
				5= '$13,000 to $14,999'
				6= '$15,000 to $19,999'
				7= '$20,000 to $24,999'
				8= '$25,000 to $29,999'
				9= '$30,000 to $34,999'
				10='$35,000 to $39,999'
				11='$40,000 to $49,999'
				12='$50,000 to $59,999'
				13='$60,000 to $69,999'
				14='$70,000 to $79,999'
				15='$80,000 to $89,999'
				16='$90,000 to $99,999'
				17='$100,000 or more';

	/* Labels for the usage-frequency codes 1 through 10, plus 99 for Unknown. */
	value ofn	1= 'Every day'
				2= 'Nearly every day'
				3= '3 to 4 times a week'
				4= '1 to 2 times a week'
				5= '2 to 3 times a month'
				6= 'Once a month'
				7= '7 to 11 times a year'
				8= '3 to 6 times a year'
				9= '2 times a year'
				10= 'Once a year'
				99= 'Unknown';
run;

/* Build the persistent analysis data set worklib.working from
   mydata.nesarc_pds: label the variables of interest, recode the Unknown
   codes to missing, drop very high incomes and attach display formats. */
data worklib.working;
	set mydata.nesarc_pds;

	/* Running count of rows where S3BQ1A4 is 1. This is a sum statement, so
	   amped is retained from one row to the next. */
	if S3BQ1A4 = 1 then amped + 1;

	/* Human-readable labels for the output. */
	label	S3BQ1A4  = 'Ever used amphetamines'
			S1Q10A   = 'Total personal income in last 12 months'
			S1Q10B   = 'Total personal income in last 12 months:Category'
			S3BD4Q2E = 'How often used amphetamines when using the most';

	/* Treat the Unknown codes as missing values.
	   Fix: the original used WHEN outside a SELECT group, which is not valid
	   DATA step syntax. Plain IF-THEN assignments do the intended recode. */
	if S3BD4Q2E = 99 then S3BD4Q2E = .;
	if S3BQ1A4 = 9 then S3BQ1A4 = .;

	/* Caution: rows with S1Q10A of 250000 or more are removed from the
	   output data set. Only do this when you are sure. */
	if S1Q10A >= 250000 then delete;

	/* Fix: the inc. format labels the category codes 0 to 17, so it belongs
	   on the category variable S1Q10B and not on the amount variable S1Q10A. */
	format S3BQ1A4 AMP. S1Q10B inc. S3BD4Q2E ofn.;
run;
/* Sort worklib.working in place by amphetamine use, usage frequency and
   income category. */
proc sort
	data=worklib.working;
	by S3BQ1A4
	   S3BD4Q2E
	   S1Q10B;
run;
/* One-way frequency tables, each with a frequency plot. This step is here
   for the plots. The input data set is named explicitly so the step does not
   depend on whichever data set happened to be created last. */
proc freq data=worklib.working;
	tables S3BQ1A4 S1Q10B S3BD4Q2E / plots=freqplot;
run;

/* Correlation between income and usage frequency, with a scatter plot.
   Fix: the option is spelled PLOTS in the PROC CORR documentation.
   MAXPOINTS is raised so the plot is still drawn for a large number of points.
   The input data set is named explicitly so the step does not depend on
   whichever data set happened to be created last. */
proc corr data=worklib.working plots(maxpoints=99999)=scatter;
	var S1Q10A S3BD4Q2E;