* Start from an empty session and turn off output paging so the do-file
* runs without waiting for a key press.
clear all
set more off

* All relative file names used below are resolved against this directory.
* NOTE(review): machine-specific absolute path; it must be edited before the
* do-file can run on another computer.
cd "/Users/lydiaassouad/Dropbox/WID_MiddleEast/Assouad2017Lebanon/Assouad2017DistributionSeries/FiscalData"


********************************************************************************
********************************************************************************
************ Cleaning v1 : dropping inconsistent observations ******************
********************************************************************************
********************************************************************************

* Load the raw fiscal file (4704282 observations in the original run).
use "LebanonRaw_new.dta"

* A former step that dropped BPT_REV, SEQ_NO and the INH variables as
* duplicates is disabled: BPT_REV is kept and renamed to rent below.
* Inheritance is analysed separately.

* ---- Give the raw columns readable names ----

* Identifiers
rename YEAR_NO  year
rename TAX_     id

* Self-employment income (lump-sum method): gross and net
rename MAK_TOT  selfemplgross
rename MAK_PRF  selfemplnet

* S-Corp income: gross and net
rename HAK_TOT  corpgross
rename HAK_PRF  corpnet

* Partners income: gross and net
rename ASH_TOT  partgross
rename ASH_PRF  partnet

* Wages: gross and net
rename F3101    wagegross
rename F3505    wagenet

* Rent (built property revenue)
rename BPT_REV  rent

* Taxes: on labour, on rent, on profits
rename F3605    labortax
rename BPT_TAX  renttax
rename INC_TAX  profittax

* ---- Rent and rent tax: a missing value is recoded as zero ----
foreach v in rent renttax {
    replace `v' = 0 if `v' == .
}
* Original run: (3258637 real changes made)



/*
1. Turnover variables, wages, rents and taxes cannot be negative
2. Taxable income cannot be greater than gross income (net < gross)
3. Taxes cannot be greater than taxable income 
4. Implausibly high values 
*/ 



* 1. Non-negativity.
*    Gross turnover of partners and of individuals in S-corps, wages (gross
*    and net), self-employment income (gross and net), built property revenue
*    and all three taxes cannot be negative. Net S-corp and net partner
*    profits are not constrained here.

local nonnegative partgross corpgross       ///
                  wagegross wagenet         ///
                  selfemplgross selfemplnet ///
                  rent                      ///
                  labortax renttax profittax

* The rules are applied one variable at a time, in the order listed above.
foreach v of local nonnegative {
    drop if `v' < 0
}

* Observations deleted in the original run, in the same order:
*   partgross       154
*   corpgross        60
*   wagegross       229
*   wagenet       16612
*   selfemplgross    14
*   selfemplnet      11
*   rent              0
*   labortax      13533
*   renttax          22
*   profittax         0


* 2. Net income cannot exceed gross income.
*    Self-employment income is taxed under a lump-sum scheme: the net profit
*    is assessed as a percentage of the total income of the taxpayer, i.e.
*        selfemplnet = selfemplgross * lump-sum rate
*    so selfemplgross must be at least as large as selfemplnet.
*    The same net <= gross rule is applied to wages, S-corp income and
*    partner income.
*
* NOTE(review): Stata ranks a missing value above every number, so a row with
* a missing net amount and a non-missing gross amount is dropped by this rule
* as well - confirm this is intended.

foreach stem in wage selfempl corp part {
    drop if `stem'net > `stem'gross
}

* Observations deleted in the original run, in the same order:
*   wage      19889
*   selfempl   1253
*   corp       2942
*   part       9630

 
* 3. The tax levied cannot exceed the income it is levied on.
*    Profit tax can be levied even if individuals declare negative income
*    flows, so it is checked against positive-profit conditions and against
*    aggregate gross profits rather than against each net amount.
*
* NOTE(review): Stata ranks a missing value above every number, so a missing
* tax amount satisfies the "greater than" tests below - confirm this is
* intended.

* Aggregate profit income over the three profit sources. The sum is missing
* whenever one of its components is missing.
gen profitnet   = selfemplnet   + corpnet   + partnet
gen profitgross = selfemplgross + corpgross + partgross

* Labour tax against net wages, then rent tax against rent
drop if labortax > wagenet
drop if renttax  > rent
* Original run: (164337 and 1414 observations deleted)

* The profit tax does not distinguish between the types of profit taxed.
* It cannot be positive when no profit source is positive, first on net
* amounts and then on gross amounts (self-employment amounts are tested with
* an equality because negative values were removed in step 1).
drop if profittax > 0 & corpnet   <= 0 & selfemplnet   == 0 & partnet   <= 0
drop if profittax > 0 & corpgross <= 0 & selfemplgross == 0 & partgross <= 0
* Original run: (4926 and 0 observations deleted)

* Profit tax larger than total gross profits
drop if profittax > profitgross
* Original run: (17 observations deleted)
 
 /*4. Implausibly high values 
* Individual wages between 2% and 200% of GDP 
*/

* Upper bound on individual gross wages.
* NOTE(review): presumably derived from the GDP-based rule stated in the
* header of this step - TODO confirm how the figure was obtained.
* NOTE(review): a missing wagegross also satisfies the test below, because
* Stata ranks missing values above every number - confirm this is intended.
local wagecap 1412014120000

drop if wagegross >= `wagecap'
* Original run: (15 observations deleted)

* 4469224 observations remain, i.e. 4.99% of the observations were dropped.

* Intermediate file read by the outlier-removal step that follows.
save Lebanon_cl1.dta, replace

***********************************************************************************************
*  Removing outliers identified (see do_Outliers and ComparisonCleaning.xlsx for their identification)
***********************************************************************************************

* Outlier removal, year by year, followed by assembly of the final file.
*
* For each year and each income variable, the value is expressed as a
* multiple of that year's mean of the variable. Values whose multiple exceeds
* the variable-specific threshold are set to zero (the observation is kept).
*
* Changes with respect to the previous version of this step:
*   - profitnet is rebuilt after the outlier replacement. It used to keep the
*     value computed before the replacement, so zeroed profit outliers still
*     entered the income variable.
*   - the ratio variable for wages is referred to by its full name
*     (cleanwagenet) instead of an abbreviation.
*   - the year selection is done once, when the data are loaded, instead of
*     being repeated for every variable.
*   - yearly intermediate data are stored in temporary files, which Stata
*     removes by itself, instead of files named after the year in the
*     working directory.

local variables wagenet corpnet partnet selfemplnet rent
local firstyear 2005
local lastyear  2014

forval j = `firstyear'/`lastyear' {

    * Load only the observations of the current year
    use if year == `j' using Lebanon_cl1.dta, clear

    foreach x of local variables {
        * The sort has no effect on the mean; the last one (on rent) sets
        * the row order of the saved yearly data.
        gsort -`x'
        summarize `x'
        gen clean`x' = `x'/r(mean)
    }

    * Thresholds are multiples of the yearly mean.
    * NOTE(review): a missing ratio (missing income value, or a yearly mean
    * equal to zero) also satisfies these tests, so such values are set to
    * zero too - confirm this is intended.
    replace rent        = 0 if cleanrent        > 1239
    replace wagenet     = 0 if cleanwagenet     > 702
    replace selfemplnet = 0 if cleanselfemplnet > 2267
    replace corpnet     = 0 if cleancorpnet     > 28709
    replace partnet     = 0 if cleanpartnet     > 8407

    drop clean*

    * Keep the profit aggregate in line with its cleaned components
    replace profitnet = selfemplnet + corpnet + partnet

    tempfile year`j'
    save `year`j'', replace
}

* Stack the yearly files in chronological order
use `year`firstyear'', clear
local nextyear = `firstyear' + 1
forval j = `nextyear'/`lastyear' {
    append using `year`j''
}

* Total income from the cleaned net components, scaled by 1.25
gen income = 1.25*(wagenet + profitnet + rent)

* Attach the year-level parameters; popsize is renamed pop20 and the other
* parameter columns matched by the wildcard patterns below are discarded.
merge m:1 year using "/Users/lydiaassouad/Dropbox/WID_MiddleEast/Assouad2017Lebanon/Assouad2017DistributionSeries/GpinterIncome/LebanonIncomeParameters.dta", nogenerate
rename popsize pop20
drop factor* Lebanon* *Lebanon

save Lebanon_cleaneddrop.dta, replace

* The intermediate file of the first cleaning stage is no longer needed
erase Lebanon_cl1.dta


/*
********************************************************************************
********************************************************************************
************ Comparing the effect of the cleaning procedure chosen *************
********************************************************************************
********************************************************************************



clear all

cd "/Users/lydiaassouad/Dropbox/WID_MiddleEast/Assouad2017Lebanon/Assouad2017DistributionSeries/FiscalData/Results"

forval j=2005/2014{
import excel "FiscalSeries.xlsx", sheet("`j'") firstrow clear
save FiscalSeries1`j'.dta, replace
import excel "FiscalSeriesdrop.xlsx", sheet("`j'") firstrow clear
save FiscalSeries2`j'.dta, replace
}



forval j=2005/2014{
clear all
use FiscalSeries2`j'.dta, replace
foreach x of varlist th* b* top* {
rename `x' `x'2
}
save FiscalSeries2`j'.dta, replace
}

clear all
forval j=2005/2014{
use FiscalSeries1`j'.dta
merge 1:1 _n using "FiscalSeries2`j'.dta"
drop _merge
save comparisoncleaning`j'.dta, replace
}

forval j=2005/2014{
foreach num of numlist 1 2 {
erase FiscalSeries`num'`j'.dta
}
}


forval j=2005/2014{
use comparisoncleaning`j'.dta

gen percdiff_thresh = ((thresh2-thresh)/thresh) 
drop thresh thresh2

gen percdiff_bracketavg = ((bracketavg2-bracketavg)/bracketavg) 
drop bracketavg bracketavg2


gen percdiff_topavg = ((topavg2-topavg)/topavg) 
drop topavg topavg2

gen percdiff_b = ((b2-b)/b) 
drop b b2
save comparisoncleaning`j'.dta, replace
}

forval j=2005/2014{
use comparisoncleaning`j'.dta
export excel using "ComparaisonCleaning.xlsx" if year ==`j', sheet("`j'") firstrow(variables) sheetreplace
 }
 
forval j=2005/2014{
 erase comparisoncleaning`j'.dta
 }

 


