


/****************************************************************************************************************************************
		
		Converts the raw income tax tabulations into the Gpinter input format (to run the Pareto interpolation on the website http://apps.wid.world/gpinter/)

*****************************************************************************************************************************************/

		

*** Project paths (machine-specific). Exactly one projdir/projsimul pair must be active; the alternatives are kept commented out.
***   projdir   : folder holding GGP2017IncomeTaxTabulations.xlsx; it is also made the working directory
***   projsimul : folder under which the Gpinter input and output files are written
*** The pair left active is the one used by the later steps of this file.
*global projdir "\\ulysse\users\BGarbinti\brtrd\Projet wealth_income\TransfertsDGFiPExcel"
*global projdir "C:\Users\Bertrand\Dropbox\WIDFrance\Papers\GGP2017DINA\GGP2017DINAAppendixD\IncomeTaxTabulations"
*global projsimul "C:\Users\Bertrand\Dropbox\WIDFrance\Papers\GGP2017DINA\GGP2017DINAAppendixD\StataFiles"
global projdir "C:\Users\G839276\Dropbox\WIDFrance\Papers\GGP2017DINA\GGP2017DINAAppendixD\IncomeTaxTabulations"
global projsimul "C:\Users\G839276\Dropbox\WIDFrance\Papers\GGP2017DINA\GGP2017DINAAppendixD\StataFiles"
cd "$projdir"

*** 1- Getting p90 from raw data
**** The exact p90 is generally absent from the tax tabulation. There are thresholds above and below.
**** We use Pareto interpolations to compute the p90 threshold (and share) from these adjacent thresholds. It is used afterwards to compute the p50 and p10 for the earlier years, where they are not in the tax tabulations.


*** 1900 to 2013 (format changes from 2014)
	**** Years and averages: for each yearly sheet, column B of cells A1:B2 is stored as average in average_YYYY.dta
	foreach yr of numlist 1900 1910 1915(1)2013 {
		import excel using "GGP2017IncomeTaxTabulations.xlsx", sheet("`yr'") cellrange(A1:B2) firstrow clear
		rename B average
		generate year = `yr'
		keep year average
		order year average
		save average_`yr', replace
	}
	*** thr, p, b and s: keep only the rows strictly above a period-specific cutoff on p
	foreach yr of numlist 1900 1910 1915(1)2013 {
		* Cutoff on p by period:
		*   1911 to 1925                 : p > 0.9
		*   1926 to 1984                 : p > 0.5
		*   1900, 1910, 1985 and later   : p > 0.1
		if inrange(`yr', 1911, 1925) {
			local pmin 0.9
		}
		else if inrange(`yr', 1926, 1984) {
			local pmin 0.5
		}
		else {
			local pmin 0.1
		}
		import excel using "GGP2017IncomeTaxTabulations.xlsx", sheet("`yr'") cellrange(A3) firstrow clear
		keep thr p b s
		generate year = `yr'
		order year thr p b s
		* A missing p compares as larger than any number, so it has to be excluded explicitly
		keep if p > `pmin' & !missing(p)
		save thr_p_b_`yr', replace
	}
	*** Creating temp Excel file: one sheet per year in rawdata_byyeartemp.xlsx
	foreach year of numlist 1900 1910 1915(1)2013{
		* clear is the documented option of use for discarding the data in memory
		use thr_p_b_`year', clear
		* average_YYYY holds a single observation for the year, so this is a many-to-one merge
		merge m:1 year using average_`year', nogen
		sort p
		* Drops a row with p==0, thr==0 and missing b when the preceding row is the same
		drop if p==0 & thr ==0 & missing(b) & p[_n-1]==0 & thr[_n-1]==0 & missing(b[_n-1])
		order year average p thr  b
		export excel "$projsimul\SimulationGpinter\rawdata\rawdata_byyeartemp.xlsx", sheet("`year'") firstrow(variables) nolabel sheetmodify
	}
	*** Remove the per-year intermediate .dta files written by the loops above
		foreach yr of numlist 1900 1910 1915(1)2013 {
			foreach stub in average thr_p_b {
				erase `stub'_`yr'.dta
			}
		}
	
	
*** From 2014 onwards
	foreach yr of numlist 2014 {
		* The sheet is read from cell A3 onwards and exported as is, with year, average, p, thr and b moved to the front
		import excel using "GGP2017IncomeTaxTabulations.xlsx", sheet("`yr'") cellrange(A3) firstrow clear
		order year average p thr b
		export excel using "$projsimul\SimulationGpinter\rawdata\rawdata_byyeartemp.xlsx", sheet("`yr'") firstrow(variables) nolabel sheetmodify
	}
	 	
*** 2- Apply gpinter to rawdata_byyeartemp


*** 3- Compute Table TD10temp (in exportresults_DINA_ApD)
******************importing gperc tax unit series for taxable income 1900-2014*******************
* Paths are declared again here, with the same values as the ones set before step 1
* (presumably so that step 3 can be run on its own - TODO confirm)
global projdir "C:\Users\G839276\Dropbox\WIDFrance\Papers\GGP2017DINA\GGP2017DINAAppendixD\IncomeTaxTabulations"
global projsimul "C:\Users\G839276\Dropbox\WIDFrance\Papers\GGP2017DINA\GGP2017DINAAppendixD\StataFiles"
* Relative file names used below resolve against projdir
cd "$projdir"


	* 1900-2014: stack the per-year Gpinter output files (YYYY.csv) into a single dataset
		clear
		tempfile stacked
		local nloaded 0
		foreach year of numlist 1900 1910 1915(1)2014 {
			* The path is quoted so that a project folder containing spaces still works
			import delimited "$projsimul/SimulationGpinter/ouput/temp/`year'.csv", rowrange(1) colrange(3) clear
			gen year=`year'
			di `year'
			* When b is not numeric, the text "Inf" is recoded to missing and b is converted to numeric
			capture confirm numeric variable b
			if _rc {
				replace b="." if b=="Inf"
				destring b, replace
			}
			drop topsh
			* Add the years already processed, then store the running stack
			if `nloaded'>0 {
				append using "`stacked'"
			}
			save "`stacked'", replace
			local ++nloaded
		}
	* Merge: the data in memory now holds all years
		* p is rescaled to integer units of 1/100000
		replace p=round(p*100000)
		sort year p
		rename topavg ytop
		save "$projsimul/SimulationGpinter/ouput/temp/gperc_fiscalincome_temp_1900_2014.dta", replace

*************************extracting table D10temp: summary statistics for distribution of taxable income among tax units*
	* Builds table TD10temp: one row per year with the mean, income shares, thresholds relative to the mean
	* and ratios of top average to threshold, from the stacked Gpinter output.
	* In that file p is an integer in units of 1/100000 (the last row of a year is p==99999).
	use "$projsimul/SimulationGpinter/ouput/temp/gperc_fiscalincome_temp_1900_2014.dta", clear
	* No variable brought in by this merge survives the keep below
	merge m:1 year using "$projsimul/temp/averages.dta", nogen
	rename thr ythr
	keep year p ytop ythr
	sort year p
	* yint: average within the bracket that starts at p, backed out from the top averages of this row and the next one
	gen yint=ytop if p==99999
	replace yint=((100000-p)*ytop-(100000-p[_n+1])*ytop[_n+1])/(p[_n+1]-p) if p<99999
	*codebook yint
	*check for consistency (year 1943 only): a bracket average outside its two thresholds is replaced by their midpoint
	replace yint=(ythr+ythr[_n+1])/2 if year==1943 & (yint<ythr | yint>ythr[_n+1]) & year==year[_n+1]
	*--> ok: 0 replacement
	
	* f: width of the bracket that starts at p, in units of 1/100000
	gen f=1 if p==99999
	replace f=p[_n+1]-p if p<99999

	sort year p
	* p0 is computed on the row where the threshold goes from 0 to a positive value.
	* NOTE(review): only the p==0 row is kept at the end, so a value computed on any other row is discarded
	* and p0 stays 0 for that year. The terms p/1000 and f/100000 are also on different scales. Confirm both are intended.
	gen p0=0 if p==0
	replace p0=p/1000+(1-yint/(ythr[_n+1]-ythr))*f/100000 if ythr==0 & ythr[_n+1]>0 & year[_n+1]==year
	keep if p==0 | p==10000 | p==50000 | p==90000 | p==99000 | p==99900 | p==99990 | p==99999
	sort year p 
	* From here each year is expected to hold these 8 rows in this order; every statistic is stored on the p==0 row.
	* The by prefix makes the look-ahead subscripts relative to the year, so they can never read rows of the following year.
	gen ymean=ytop if p==0
	by year: gen bottom10=1-0.9*ytop[_n+1]/ytop if p==0
	by year: gen bottom50=1-0.5*ytop[_n+2]/ytop if p==0
	by year: gen top10=0.1*ytop[_n+3]/ytop if p==0
	gen middle40=1-top10-bottom50 if p==0
	by year: gen top1=0.01*ytop[_n+4]/ytop if p==0
	by year: gen top01=0.001*ytop[_n+5]/ytop if p==0
	by year: gen top001=0.0001*ytop[_n+6]/ytop if p==0
	by year: gen top0001=0.00001*ytop[_n+7]/ytop if p==0
	* Thresholds expressed as a ratio to the mean
	by year: gen p10=ythr[_n+1]/ymean if p==0
	by year: gen p50=ythr[_n+2]/ymean if p==0
	by year: gen p90=ythr[_n+3]/ymean if p==0
	by year: gen p99=ythr[_n+4]/ymean if p==0
	by year: gen p999=ythr[_n+5]/ymean if p==0
	by year: gen p9999=ythr[_n+6]/ymean if p==0
	by year: gen p99999=ythr[_n+7]/ymean if p==0
	* Ratio of the average above each threshold to the threshold itself
	by year: gen b10=ytop[_n+1]/ythr[_n+1] if p==0
	by year: gen b50=ytop[_n+2]/ythr[_n+2] if p==0
	by year: gen b90=ytop[_n+3]/ythr[_n+3] if p==0
	by year: gen b99=ytop[_n+4]/ythr[_n+4] if p==0
	by year: gen b999=ytop[_n+5]/ythr[_n+5] if p==0
	by year: gen b9999=ytop[_n+6]/ythr[_n+6] if p==0
	by year: gen b99999=ytop[_n+7]/ythr[_n+7] if p==0
	keep if p==0
	keep year ymean bottom10 bottom50 middle40 top10 top1 top01 top001 top0001 p0 p10 p50 p90 p99 p999 p9999 p99999 b10 b50 b90 b99 b999 b9999 b99999
	order year ymean bottom10 bottom50 middle40 top10 top1 top01 top001 top0001 p0 p10 p50 p90 p99 p999 p9999 p99999 b10 b50 b90 b99 b999 b9999 b99999
	
	* bottom10 and p10 are blanked for every year (their year restriction is commented out);
	* bottom50, middle40, p0 and p50 are blanked before 1985 only
	replace bottom10 = . /*if  year < 1985*/
	replace bottom50 = . if  year < 1985
	replace middle40 = . if  year < 1985
	replace p0 = . if  year < 1985 
	replace p10 = . /*if  year < 1985*/
	replace p50 = . if  year < 1985
	
	save "$projsimul/temp/TableD10temp.dta", replace
	export excel using "$projsimul/temp/exportresults_DINA_ApD.xlsx", sheet("TD10temp") sheetmodify firstrow(variables)

*** To get p10 and p50, following Piketty (2001, 1998) (For a complete explanation about methodology, see Piketty 1998, appendix B) 
*** 4- Open GGP2017DinaAppendixD and exportresults_DINA_ApD, to update table TD3 and save it ***
*** 5- Open GGP2017IncomeTaxTabulations and GGP2017DinaAppendixD, to update tabulations (thanks to the update of table TD3) and save it ***

*** 6- Getting the final annual tabulations ***
	**** Years and averages: column B of cells A1:B2 of each yearly sheet, saved as average_YYYY.dta
	foreach yr of numlist 1900 1910 1915(1)2013 {
		import excel using "GGP2017IncomeTaxTabulations.xlsx", sheet("`yr'") cellrange(A1:B2) firstrow clear
		rename B average
		generate year = `yr'
		keep year average
		order year average
		save average_`yr', replace
	}
	**** Share of singles: column D of cells C1:D2 of each yearly sheet, saved as singleshare_YYYY.dta
	foreach yr of numlist 1900 1910 1915(1)2013 {
		import excel using "GGP2017IncomeTaxTabulations.xlsx", sheet("`yr'") cellrange(C1:D2) firstrow clear
		rename D singleshare
		generate year = `yr'
		keep year singleshare
		order year singleshare
		save singleshare_`yr', replace
	}
	**** thr, p, b and s: all rows of the yearly tabulation with a strictly positive, non-missing p
	foreach yr of numlist 1900 1910 1915(1)2013 {
		import excel using "GGP2017IncomeTaxTabulations.xlsx", sheet("`yr'") cellrange(A3) firstrow clear
		keep thr p b s
		* A missing p compares as larger than any number, so it has to be excluded explicitly
		keep if p > 0 & !missing(p)
		generate year = `yr'
		order year thr p b s
		save thr_p_b_`yr', replace
	}
	*** Creating final Excel file: one sheet per year in rawdata_byyear.xlsx
	foreach year of numlist 1900 1910 1915(1)2013{
		* clear is the documented option of use for discarding the data in memory
		use thr_p_b_`year', clear
		* Both using files hold a single observation for the year, so these are many-to-one merges
		merge m:1 year using average_`year', nogen	
		merge m:1 year using singleshare_`year', nogen	
		* Sort first, so that singleshare is kept on the lowest-p row whatever row order the merges leave behind
		sort p
		replace singleshare=. if _n>1
		* Drops a row with p==0, thr==0 and missing b when the preceding row is the same
		drop if p==0 & thr ==0 & missing(b) & p[_n-1]==0 & thr[_n-1]==0 & missing(b[_n-1])
		order year average p thr b s singleshare
		
		di "`year'"
		export excel "$projsimul\SimulationGpinter\rawdata\rawdata_byyear.xlsx", sheet("`year'") firstrow(variables) nolabel sheetmodify
	}
	*** Remove the per-year intermediate .dta files written by the loops above
		foreach yr of numlist 1900 1910 1915(1)2013 {
			foreach stub in average singleshare thr_p_b {
				erase `stub'_`yr'.dta
			}
		}
	
	
*** From 2014 onwards
	foreach year of numlist 2014{
		* The sheet is read from cell A3 onwards
		import excel "GGP2017IncomeTaxTabulations.xlsx", sheet("`year'")   cellrange(A3) firstrow  clear
		* A missing p compares as larger than any number, so p>0 alone would keep rows with no p;
		* they are excluded explicitly, as is done for the years up to 2013
		keep if p>0 & !missing(p)
		order year average p thr b s singleshare
		* singleshare is kept on the first row only
		replace singleshare=. if _n>1
		di "`year'"
		export excel "$projsimul\SimulationGpinter\rawdata\rawdata_byyear.xlsx", sheet("`year'") firstrow(variables) nolabel sheetmodify
	}


*** 7- Apply gpinter to rawdata_byyear  ***




