/*******************************************************************************
	
		validation.do
		10.19.2025
		Validation analysis for:
		"The Value of Internal Labor Markets: Evidence from LinkedIn Profiles and U.S. Inventors."
		Letian (LT) Zhang and Simeng Wang
		
*******************************************************************************/


**# Indeed
{
* Validation against Indeed ratings.
* Steps: (1) collapse review-level Indeed ratings to company-year means,
*        (2) attach them to the inventor/LinkedIn analysis file,
*        (3) compute firm-year promotion and external-hiring rates,
*        (4) bin-scatter the job-security rating against each rate.
use indeed_us5.dta, clear
//load indeed files at the review level (clear: do not abort if other data are in memory)
keep indeed_company_id year security_rating benefits_rating culture_rating management_rating work_life_rating
drop if missing(indeed_company_id)|missing(year)

foreach var of varlist *rating{
	//Aggregate to company-year means for each rating (new variables get the suffix _ey)
	bysort indeed_company_id year: egen `var'_ey=mean(`var')
}

//Keep one row per company-year (post-aggregation)
bysort indeed_company_id year: drop if _n!=1

//Attach the company-year ratings to the analysis file; Indeed-only company-years are discarded,
//analysis rows without Indeed data are kept (their ratings are missing).
merge 1:m indeed_company_id year using "all_inventor_linkedin_match9_analysis.dta"
drop if _merge==1
drop _merge
//NOTE(review): this overwrites the analysis file in place with the merged version.
save "all_inventor_linkedin_match9_analysis.dta",replace

//calculate the promotion rate among both inventors + non-inventors
cap drop promotion_rate4_ey
gen promotion_rate4_ey=(promotion_raw+total_promotion)/(total_inventor_year3+total_noninventor3)
//calculate the external hiring rate among both inventors + non-inventors
//(guarded like promotion_rate4_ey so the section can be re-run if the variable already exists)
cap drop external_rate4_ey
gen external_rate4_ey=(external_hiring_raw4+total_external)/(total_inventor_year3+total_noninventor3)
//one row per employer-year for plotting
bysort employer_id2 year:drop if _n!=1

//NOTE(review): count_indeed_review is not created in this section; presumably it already
//exists in the analysis file - confirm.
binscatter security_rating_ey promotion_rate4_ey   if count_indeed_review>=20&total_inventor_year3>=1, ///
	ytitle(Indeed Rating of Job Security & Advancement,size(med)) ylabel(,nogrid) xlabel(,nogrid) ///
	xtitle(Num. Internally Promoted Employees/Num. Employees,size(med)) n(20)
//replace: allow the figure to be regenerated when the file already exists
graph export "indeed_validation.png", as(png) name("Graph") replace

binscatter security_rating_ey external_rate4_ey   if count_indeed_review>=20&total_inventor_year3>=1, ///
	ytitle(Indeed Rating of Job Security & Advancement,size(med)) ylabel(,nogrid) xlabel(,nogrid) ///
	xtitle(Num. Externally Hired Employees/Num. Employees,size(med)) n(20)
graph export "indeed_validation2.png", as(png) name("Graph") replace
}

**# LightCast
{
* Validation against Lightcast (BGT) job postings.
* Steps: (1) attach LinkedIn employer ids to the postings via an exact-name crosswalk,
*        (2) count managerial and total postings per employer-year,
*        (3) merge with the LinkedIn firm-year hiring ratios and plot the relationship.

//crosswalk between lightcast and linkedin firm names
//(clear is required: the previous section leaves modified data in memory, so a bare -use- aborts)
use "bgt_new_linkedin_exact_match.dta", clear
keep employer_id2 company_raw clean_bgt_company_id
drop if missing(company_raw)
recast str company_raw
merge 1:m company_raw using  "BGT_US_Main_1023.dta"
drop if _merge==1
drop _merge
replace manager=0 if missing(manager)

//The postings file is overwritten in place below, so on a re-run the totals already exist;
//drop them first so egen/gen do not fail with "already defined".
cap drop total_manager_position
cap drop total_position
bysort employer_id2 year: egen total_manager_position=total(manager)
bysort employer_id2 year: gen total_position=_N
//NOTE(review): this replaces the raw postings file with the merged/augmented version.
save "BGT_US_Main_1023.dta",replace


keep employer_id2 year total_manager_position total_position 
bysort employer_id2 year: drop if _n!=1
//linkedin promotion rate for each firm-year (including both inventor and non-inventors)
merge 1:1 employer_id2 year using "linkedin_ratio.dta" 
drop if _merge==1
drop _merge 
save  "linkedin_ratio.dta",replace

//number of non-managerial job postings per managerial job posting
//(missing when a firm-year has no managerial postings: division by zero)
gen measure1=(total_position-total_manager_position)/total_manager_position 
gen lmeasure1=ln(measure1+1)
//internal hiring rate divided by external hiring rate
gen ratio=internal_hire/external_hire 
gen lratio=ln(ratio+1)
//total number of external hires on LinkedIn for each firm-year minus the total number of job postings
//for the same firm-year (absolute value). This measures the extent to which firms actually advertise
//their openings on hiring platforms.
gen diff=abs((total_external_hire)- total_position) 
//NOTE(review): -total- is not created in this section; presumably a variable of that exact name comes
//from linkedin_ratio.dta. If not, the abbreviation is ambiguous (total_position, total_manager_position, ...).
binscatter lratio lmeasure1 if total>=5&diff<30 
}

**# NYC Open Data
{
* NYC Citywide Payroll: build a harmonised agency name, a name-based person id,
* and the yearly share of person-records whose title differs from the previous record.
import delimited using "Citywide_Payroll_Data__Fiscal_Year__20251010.csv", bindquote(strict) clear

* --- agency name: prefix, expand abbreviations, lower-case, squeeze blanks, strip "#<digits>" ---
gen agencyname2 = "New York City" + " " + agencyname
replace agencyname2 = subinstr(agencyname2, "NYC", "New York City", .)
replace agencyname2 = subinstr(agencyname2, "DEPT", "Department", .)
replace agencyname2 = stritrim(strtrim(lower(agencyname2)))
replace agencyname2 = ustrregexra(agencyname2, "#[0-9]+", "")
replace agencyname2 = stritrim(strtrim(agencyname2))

* --- identifiers ---
egen agencyname2_id = group(agencyname2)
* NOTE(review): egen group() without the -missing- option returns missing for any row where one of
* the listed variables is missing/empty, so records with a blank midinit or firstname get no
* individual_id. Confirm this is intended.
egen individual_id = group(lastname firstname midinit agencyname2_id)
replace individual_id = . if missing(lastname)
save "nyc_data.dta", replace

* --- promotion flag: title differs from the person's previous record (never on the first record) ---
cap drop promotion_rate promotion
sort individual_id fiscalyear
by individual_id: gen promotion = (title != title[_n-1]) & (_n != 1)
* rows without a person id cannot be tracked over time
replace promotion = . if missing(individual_id)
* yearly mean of the flag across all rows with a person id
bysort fiscalyear: egen promotion_rate = mean(promotion)
save "nyc_data.dta", replace

**# match agency name to linkedin 
{

//round1: match on the agency name with "new york city" shortened to "nyc"
//NOTE(review): nyc_agency.dta is not created in this file; it is assumed to hold one row per
//agencyname2_id with agencyname2 - confirm where it is built.
//-use- accepts -clear-, not -replace-.
use "nyc_agency.dta", clear
//nyc_agency.dta is overwritten in place by the rounds below; drop the helper from a previous run
cap drop agencyname3
gen agencyname3=subinstr(agencyname2,"new york city","nyc",.)
replace agencyname3=strtrim(agencyname3)
replace agencyname3=stritrim(agencyname3)
save "nyc_agency.dta",replace

rename agencyname3 clean_rcid_firm_name 
merge 1:m clean_rcid_firm_name using "clean_rcid_id_name.dta" //this file stores the cleaned linkedin firm names
keep if _merge==3
drop _merge
keep agencyname2_id employer_id2
//start the agency-to-employer match file from scratch
save "nyc_linkedin_firm_match.dta",replace


//round2: match on the unmodified cleaned agency name
use "nyc_agency.dta", clear
rename agencyname2 clean_rcid_firm_name 
merge 1:m clean_rcid_firm_name using "clean_rcid_id_name.dta"
keep if _merge==3
drop _merge
keep agencyname2_id employer_id2
append using "nyc_linkedin_firm_match.dta"
duplicates drop 
save "nyc_linkedin_firm_match.dta",replace

//flag every agency that has at least one match so far
keep agencyname2_id 
duplicates drop 
merge 1:1 agencyname2_id using  "nyc_agency.dta"
//the flag is (re)initialised here; drop a stale copy carried over from a previous run
cap drop matched_to_linkedin
gen matched_to_linkedin=1 if _merge==3
drop _merge
save "nyc_agency.dta",replace

//round3: collapse the doubled prefix "new york city new york city" and retry the unmatched agencies.
//Continues from the agency data left in memory by the previous round.
//nyc_agency.dta is overwritten in place, so a stale agencyname4 may exist on a re-run.
cap drop agencyname4
gen agencyname4=agencyname2 
replace agencyname4=subinstr(agencyname2,"new york city new york city","new york city",.)
save "nyc_agency.dta",replace
rename agencyname4 clean_rcid_firm_name 
keep if matched_to_linkedin!=1
merge 1:m clean_rcid_firm_name using "clean_rcid_id_name.dta"
keep if _merge==3
drop _merge
keep agencyname2_id employer_id2
//add the new matches to the cumulative match file
append using "nyc_linkedin_firm_match.dta"
duplicates drop 
save "nyc_linkedin_firm_match.dta",replace
//refresh the matched flag on the agency list
keep agencyname2_id 
duplicates drop 
merge 1:1 agencyname2_id using  "nyc_agency.dta"
replace matched_to_linkedin=1 if _merge==3
drop _merge
save "nyc_agency.dta",replace


//round4: hand-written regex rules that rewrite agency names to the names used on LinkedIn.
//Continues from the agency data left in memory by the previous round.
//Rules are applied in order, all tested against agencyname4; a later rule overrides an earlier one.
//nyc_agency.dta is overwritten in place, so a stale agencyname5 may exist on a re-run.
cap drop agencyname5
gen agencyname5 = agencyname4

* manual replacements
replace agencyname5 = "nyc administration for children's services" if regexm(agencyname4, "admin.*children")
replace agencyname5 = "nyc office of administrative trials and hearings (oath)" if regexm(agencyname4, "admin trials|hearings")
replace agencyname5 = "nyc board of correction" if regexm(agencyname4, "board of correct")
replace agencyname5 = "nyc board of elections" if regexm(agencyname4, "board of election")
replace agencyname5 = "office of the bronx borough president" if regexm(agencyname4, "borough president.*bronx")
replace agencyname5 = "office of the brooklyn borough president" if regexm(agencyname4, "borough president.*brooklyn")
replace agencyname5 = "office of the queens borough president" if regexm(agencyname4, "borough president.*queens")
replace agencyname5 = "office of the staten island borough president" if regexm(agencyname4, "borough president.*staten")
replace agencyname5 = "new york city council" if regexm(agencyname4, "city council")
replace agencyname5 = "nyc department of consumer and worker protection" if regexm(agencyname4, "consumer.*(affair|protection)")
replace agencyname5 = "nyc department of cultural affairs" if regexm(agencyname4, "cultural affairs")
replace agencyname5 = "nyc department of citywide administrative services (dcas)" if regexm(agencyname4, "citywide admin")
replace agencyname5 = "nyc department of education" if regexm(agencyname4, "department of ed")
replace agencyname5 = "nyc department of environmental protection" if regexm(agencyname4, "environment")
replace agencyname5 = "nyc department of health and mental hygiene" if regexm(agencyname4, "health|mental hygiene")
replace agencyname5 = "nyc office of technology and innovation" if regexm(agencyname4, "info tech|telecomm|technology")
replace agencyname5 = "nyc department of parks and recreation" if regexm(agencyname4, "parks")
replace agencyname5 = "nyc department of homeless services" if regexm(agencyname4, "homeless")
replace agencyname5 = "nyc department of housing preservation and development (hpd)" if regexm(agencyname4, "housing preservation|dvlpmnt")
replace agencyname5 = "nyc human resources administration / department of social services" if regexm(agencyname4, "hra|social services")
replace agencyname5 = "nyc commission on human rights" if regexm(agencyname4, "human rights")
replace agencyname5 = "nyc landmarks preservation commission" if regexm(agencyname4, "landmarks preservation")
replace agencyname5 = "nyc mayor's office of contract services" if regexm(agencyname4, "mayors office.*contract")
replace agencyname5 = "nyc office of management and budget" if regexm(agencyname4, "management.*budget")
replace agencyname5 = "office of the new york city comptroller" if regexm(agencyname4, "comptroller")
replace agencyname5 = "office of the mayor of new york city" if regexm(agencyname4, "office of the mayor")
replace agencyname5 = "nyc taxi and limousine commission (tlc)" if regexm(agencyname4, "taxi|limousine")
replace agencyname5 = "teachers' retirement system of the city of new york (trsnyc)" if regexm(agencyname4, "teachers retirement")
replace agencyname5 = "new york city housing authority (nycha)" if regexm(agencyname4, "housing authority")
replace agencyname5 = "nyc department of youth and community development (dycd)" if regexm(agencyname4, "youth|community dev")
replace agencyname5 = strtrim(agencyname5)
replace agencyname5 = stritrim(agencyname5)
save "nyc_agency.dta",replace

//look up the rewritten names of the still-unmatched agencies in the LinkedIn name file
keep if matched_to_linkedin!=1
keep agencyname5 
rename agencyname5 clean_rcid_firm_name 
duplicates drop 
merge 1:m clean_rcid_firm_name using "clean_rcid_id_name.dta"
keep if _merge==3
drop _merge
keep clean_rcid_firm_name employer_id2
//full variable name instead of the abbreviation "clean"
rename clean_rcid_firm_name agencyname5
//The lookup above is 1:m, so a name can appear on several rows here. -merge 1:m- would abort when
//the name is not unique; remove exact duplicates and use -joinby-, which forms every
//name-by-agency pair and keeps matched pairs only (same result as merge + keep if _merge==3
//whenever the name is unique).
duplicates drop
joinby agencyname5 using "nyc_agency.dta"
keep agencyname2_id employer_id2 
duplicates drop
//add the new matches to the cumulative match file
append using "nyc_linkedin_firm_match.dta"
duplicates drop 
save "nyc_linkedin_firm_match.dta",replace
//refresh the matched flag on the agency list
keep agencyname2_id 
duplicates drop 
merge 1:1 agencyname2_id using  "nyc_agency.dta"
replace matched_to_linkedin=1 if _merge==3
drop _merge
save "nyc_agency.dta",replace


//round5: second, larger set of regex rules (tested against agencyname5, written to agencyname6).
//Continues from the agency data left in memory by the previous round.
//Rules are applied in order; a later rule overrides an earlier one.
//Literal parentheses are escaped with a single backslash: Stata string literals have no
//backslash escapes, so "\\(" would mean "literal backslash + open group" and never match "(".
//nyc_agency.dta is overwritten in place, so a stale agencyname6 may exist on a re-run.
cap drop agencyname6
gen agencyname6 = agencyname5
replace agencyname6 = "office of the bronx borough president" if regexm(lower(agencyname5), "borough president.*bronx")
replace agencyname6 = "office of the brooklyn borough president" if regexm(lower(agencyname5), "borough president.*brooklyn|borough president.*kings")
replace agencyname6 = "office of the queens borough president" if regexm(lower(agencyname5), "borough president.*queens")
replace agencyname6 = "office of the staten island borough president" if regexm(lower(agencyname5), "borough president.*staten")
replace agencyname6 = "new york city council" if regexm(lower(agencyname5), "city council")
replace agencyname6 = "office of the city clerk - nyc" if regexm(lower(agencyname5), "city clerk")
replace agencyname6 = "nyc civil service commission" if regexm(lower(agencyname5), "civil service commission")
replace agencyname6 = "civilian complaint review board (ccrb)" if regexm(lower(agencyname5), "civilian complaint review")
replace agencyname6 = "nyc conflicts of interest board" if regexm(lower(agencyname5), "conflicts of interest board")
replace agencyname6 = "nyc board of elections" if regexm(lower(agencyname5), "board of election")
replace agencyname6 = "nyc board of correction" if regexm(lower(agencyname5), "board of correct")
replace agencyname6 = "nyc districting commission" if regexm(lower(agencyname5), "districting commission")
replace agencyname6 = "nyc landmarks preservation commission" if regexm(lower(agencyname5), "landmarks preservation")
replace agencyname6 = "nyc tax commission" if regexm(lower(agencyname5), "tax commission")
replace agencyname6 = "nyc equal employment practices commission" if regexm(lower(agencyname5), "equal employ.*practices")
replace agencyname6 = "nyc office of administrative trials and hearings (oath)" if regexm(lower(agencyname5), "admin(istrative)? trials|oath")
replace agencyname6 = "nyc mayor's office of contract services" if regexm(lower(agencyname5), "mayor'?s office.*contract")
replace agencyname6 = "mayor's office of criminal justice (mocj)" if regexm(lower(agencyname5), "office of criminal justice")
replace agencyname6 = "nyc emergency management" if regexm(lower(agencyname5), "office of emergency management")
replace agencyname6 = "nyc office of labor relations" if regexm(lower(agencyname5), "office of labor relations")
replace agencyname6 = "nyc office of management and budget" if regexm(lower(agencyname5), "office of management.*budget|^nyc office of management and budget$")
replace agencyname6 = "office of the new york city comptroller" if regexm(lower(agencyname5), "office of .*comptroller")
replace agencyname6 = "office of the mayor of new york city" if regexm(lower(agencyname5), "office of the mayor")
replace agencyname6 = "nyc office of racial equity" if regexm(lower(agencyname5), "office of racial equity")
replace agencyname6 = "nyc commission on racial equity" if regexm(lower(agencyname5), "commission on racial equity")
replace agencyname6 = "nyc office of technology and innovation" if regexm(lower(agencyname5), "office of technology and innovation|info tech|telecomm|technology")
replace agencyname6 = "nyc department of buildings" if regexm(lower(agencyname5), "department of buildings")
replace agencyname6 = "nyc department of design and construction (ddc)" if regexm(lower(agencyname5), "department.*design.*construction")
replace agencyname6 = "nyc department of small business services" if regexm(lower(agencyname5), "department of business serv|small business services")
replace agencyname6 = "nyc department of citywide administrative services (dcas)" if regexm(lower(agencyname5), "citywide admin")
replace agencyname6 = "nyc department of education" if regexm(lower(agencyname5), "department of ed|^nyc department of education$")
replace agencyname6 = "nyc department of environmental protection" if regexm(lower(agencyname5), "department of environ")
replace agencyname6 = "nyc department of health and mental hygiene" if regexm(lower(agencyname5), "department of health|mental hygiene")
replace agencyname6 = "nyc department of parks and recreation" if regexm(lower(agencyname5), "department of parks|parks & recreation|parks and recreation")
replace agencyname6 = "nyc department of probation" if regexm(lower(agencyname5), "department of probation")
replace agencyname6 = "nyc department of records and information services (doris)" if regexm(lower(agencyname5), "records.*info(rmation)? service")
replace agencyname6 = "nyc department of youth and community development (dycd)" if regexm(lower(agencyname5), "youth.*community.*dev")
replace agencyname6 = "nyc department of homeless services" if regexm(lower(agencyname5), "department.*homeless")
replace agencyname6 = "nyc department of housing preservation and development (hpd)" if regexm(lower(agencyname5), "housing preservation|\(hpd\)")
replace agencyname6 = "nyc department of consumer and worker protection" if regexm(lower(agencyname5), "consumer.*(affair|protection)")
replace agencyname6 = "nyc human resources administration / department of social services" if regexm(lower(agencyname5), "hra|department of social services")
replace agencyname6 = "nyc taxi and limousine commission (tlc)" if regexm(lower(agencyname5), "taxi|limousine")
replace agencyname6 = "teachers' retirement system of the city of new york (trsnyc)" if regexm(lower(agencyname5), "teachers'? retirement")
replace agencyname6 = "nyc employees' retirement system (nycers)" if regexm(lower(agencyname5), "employees? retirement sys")
replace agencyname6 = "nyc fire pension fund" if regexm(lower(agencyname5), "fire pension fund")
replace agencyname6 = "nyc municipal water finance authority" if regexm(lower(agencyname5), "municipal water fin")
replace agencyname6 = "financial information services agency (fisa)" if regexm(lower(agencyname5), "financial info.*svcs agency|^new york city financial info svcs agency$")
replace agencyname6 = "office of payroll administration (opa)" if regexm(lower(agencyname5), "off(ice)? of payroll admin")
replace agencyname6 = "office of collective bargaining (ocb)" if regexm(lower(agencyname5), "office of collective bargain")
replace agencyname6 = "office of the new york city public advocate" if regexm(lower(agencyname5), "public advocate")
replace agencyname6 = "nyc public service corps" if regexm(lower(agencyname5), "public service corps")
replace agencyname6 = "bronx community board" if regexm(lower(agencyname5), "bronx community board")
replace agencyname6 = "brooklyn community board" if regexm(lower(agencyname5), "brooklyn community board")
//NOTE(review): the second alternative below only matches the literal text "staten island community bd==no";
//in practice this rule covers manhattan only.
replace agencyname6 = "manhattan community board" if regexm(lower(agencyname5), "manhattan community board|staten island community bd==no")  /* keep manhattan only here */
replace agencyname6 = "queens community board" if regexm(lower(agencyname5), "queens community board")
replace agencyname6 = "staten island community board" if regexm(lower(agencyname5), "staten island community (bd|board)")
replace agencyname6 = "bronx district attorney's office" if regexm(lower(agencyname5), "bronx (da|district attorney)")
replace agencyname6 = "kings county district attorney's office" if regexm(lower(agencyname5), "(kings|brooklyn) (da|district attorney)")
replace agencyname6 = "queens district attorney's office" if regexm(lower(agencyname5), "(queens|qns) (da|district attorney)")
replace agencyname6 = "new york county district attorney's office (manhattan da)" if regexm(lower(agencyname5), "(manhattan|new york county).*(da|district attorney)")
replace agencyname6 = "richmond county district attorney's office (staten island da)" if regexm(lower(agencyname5), "richmond.*(da|district attorney)|staten.*(da|district attorney)")
replace agencyname6 = "office of the special narcotics prosecutor (nyc)" if regexm(lower(agencyname5), "special narc")
replace agencyname6 = "new york city housing authority (nycha)" if regexm(lower(agencyname5), "housing authority \(nycha\)|^new york city housing authority \(nycha\)$|^nycha$")
replace agencyname6 = "nyc department of veterans' services" if regexm(lower(agencyname5), "department of veterans")
replace agencyname6 = "president, borough of manhattan" if regexm(lower(agencyname5), "president borough of manhattan")
replace agencyname6 = "nyc personnel monitors" if regexm(lower(agencyname5), "personnel monitors")
replace agencyname6 = "nyc public administrator (bronx)" if regexm(lower(agencyname5), "public administrator-?bronx")
replace agencyname6 = "nyc public administrator (kings)" if regexm(lower(agencyname5), "public administrator-?kings")
replace agencyname6 = "nyc public administrator (new york)" if regexm(lower(agencyname5), "public administrator-?new york")
replace agencyname6 = "nyc public administrator (queens)" if regexm(lower(agencyname5), "public administrator-?queens")
replace agencyname6 = "nyc public administrator (richmond)" if regexm(lower(agencyname5), "public administrator-?richmond")
replace agencyname6 = "cuny central office" if regexm(lower(agencyname5), "cuny central office")
replace agencyname6 = "hunter college high school" if regexm(lower(agencyname5), "hunter college high school")
replace agencyname6 = "bronx community college (cuny)" if regexm(lower(agencyname5), "community college \(bronx\)")
replace agencyname6 = "hostos community college (cuny)" if regexm(lower(agencyname5), "community college \(hostos\)")
replace agencyname6 = "kingsborough community college (cuny)" if regexm(lower(agencyname5), "community college \(kingsboro(w)?\)")
replace agencyname6 = "laguardia community college (cuny)" if regexm(lower(agencyname5), "community college \(laguardia\)")
replace agencyname6 = "borough of manhattan community college (bmcc) (cuny)" if regexm(lower(agencyname5), "community college \(manhattan\)")
replace agencyname6 = "queensborough community college (cuny)" if regexm(lower(agencyname5), "community college \(queensboro(w)?\)")
replace agencyname6 = "stella and charles guttman community college (cuny)" if regexm(lower(agencyname5), "guttman community college")
replace agencyname6 = "nyc administration for children's services" if regexm(lower(agencyname5), "^nyc administration for children'?s services$")
replace agencyname6 = "nyc board of correction" if regexm(lower(agencyname5), "^nyc board of correction$")
replace agencyname6 = "nyc board of elections" if regexm(lower(agencyname5), "^nyc board of elections$")
replace agencyname6 = "nyc office of technology and innovation" if regexm(lower(agencyname5), "^nyc office of technology and innovation$")
replace agencyname6 = "teachers' retirement system of the city of new york (trsnyc)" if regexm(lower(agencyname5), "^teachers'? retirement system of the city of new york \(trsnyc\)$")
replace agencyname6 = strtrim(agencyname6)
replace agencyname6 = stritrim(agencyname6)
save "nyc_agency.dta",replace

//look up the rewritten names of the still-unmatched agencies in the LinkedIn name file
keep if matched_to_linkedin!=1
keep agencyname6 

rename agencyname6 clean_rcid_firm_name 
duplicates drop 
merge 1:m clean_rcid_firm_name using "clean_rcid_id_name.dta"
keep if _merge==3
drop _merge
keep clean_rcid_firm_name employer_id2
//full variable name instead of the abbreviation "clean"
rename clean_rcid_firm_name agencyname6
//The lookup above is 1:m, so a name can appear on several rows here; -merge 1:m- would abort when
//the name is not unique. Drop exact duplicates and pair names with agencies via -joinby-
//(matched pairs only; identical to merge + keep if _merge==3 when the name is unique).
duplicates drop
joinby agencyname6 using "nyc_agency.dta"
keep agencyname2_id employer_id2 
duplicates drop
//add the new matches to the cumulative match file
append using "nyc_linkedin_firm_match.dta"
duplicates drop 

save "nyc_linkedin_firm_match.dta",replace
//refresh the matched flag on the agency list
keep agencyname2_id 

duplicates drop 
merge 1:1 agencyname2_id using  "nyc_agency.dta"
replace matched_to_linkedin=1 if _merge==3
drop _merge
save "nyc_agency.dta",replace

//round6: exact-name rewrites from the round-5 names (agencyname6) to alternative LinkedIn spellings.
//Continues from the agency data left in memory by the previous round.
//All conditions test agencyname6: the original tested an undefined variable "agencyname7" in three
//rules (housing preservation, records, city council), which stops the script. Repeated identical
//rules were removed (they were no-ops).
//nyc_agency.dta is overwritten in place, so a stale agencyname8 may exist on a re-run.
cap drop agencyname8
gen agencyname8=agencyname6
replace agencyname8="new york city buildings dept" if agencyname6=="nyc department of buildings"
replace agencyname8="new york city comptroller's office" if agencyname6=="office of the new york city comptroller"
replace agencyname8="new york city department of consumer affairs" if agencyname6=="nyc department of consumer and worker protection"
replace agencyname8="new york city department of correction" if agencyname6=="nyc board of correction"
replace agencyname8="new york city department of design construction" if agencyname6=="nyc department of design and construction (ddc)"
replace agencyname8="new york city department of health mental hygiene" if agencyname6=="nyc department of health and mental hygiene"
replace agencyname8="new york city department of homeless services" if agencyname6=="nyc department of homeless services"
replace agencyname8="new york city department of housing preservation and development" if agencyname6=="nyc department of housing preservation and development (hpd)"
replace agencyname8="new york city employees' retirement system" if agencyname6=="nyc employees' retirement system (nycers)"
replace agencyname8="new york city human resources administration" if agencyname6=="nyc human resources administration / department of social services"
//NOTE(review): landmarks preservation commission is mapped to the human resources administration - looks unintended; confirm.
replace agencyname8="new york city human resources administration" if agencyname6=="nyc landmarks preservation commission"
replace agencyname8="new york city taxi limousine" if agencyname6=="nyc taxi and limousine commission (tlc)"
replace agencyname8="nyc department of veterans services" if agencyname6=="nyc department of veterans' services"
//NOTE(review): civil service commission is mapped to the department of probation - looks unintended; confirm.
replace agencyname8="nyc department probation" if agencyname6=="nyc civil service commission"
replace agencyname8="nyc department probation" if agencyname6=="nyc department of probation"
replace agencyname8="nyc housing authority" if agencyname6=="new york city housing authority (nycha)"
replace agencyname8="nyc office of management budget" if agencyname6=="nyc office of management and budget"
replace agencyname8="nyc office of payroll administration" if agencyname6=="office of payroll administration (opa)"
replace agencyname8="nyc racial justice commission" if agencyname6=="nyc commission on racial equity"
replace agencyname8="nyc racial justice commission" if agencyname6=="nyc office of racial equity"
replace agencyname8="nyc records" if agencyname6=="nyc department of records and information services (doris)"
replace agencyname8="office of the mayor of new york city" if agencyname6=="nyc office of the mayor"
replace agencyname8="the new york city council" if agencyname6=="new york city council"
replace agencyname8="the new york city department of education" if agencyname6=="nyc department of education"
replace agencyname8="the new york city department of youth community development" if agencyname6=="nyc department of youth and community development (dycd)"
save "nyc_agency.dta",replace

//look up the rewritten names of the still-unmatched agencies in the LinkedIn name file
keep if matched_to_linkedin!=1
keep agencyname8

rename agencyname8 clean_rcid_firm_name 
duplicates drop 
merge 1:m clean_rcid_firm_name using "clean_rcid_id_name.dta"
keep if _merge==3
drop _merge
keep clean_rcid_firm_name employer_id2
//full variable name instead of the abbreviation "clean"
rename clean_rcid_firm_name agencyname8
//The lookup above is 1:m, so a name can appear on several rows here; -merge 1:m- would abort when
//the name is not unique. Drop exact duplicates and pair names with agencies via -joinby-
//(matched pairs only; identical to merge + keep if _merge==3 when the name is unique).
duplicates drop
joinby agencyname8 using "nyc_agency.dta"
keep agencyname2_id employer_id2 
duplicates drop
//add the new matches to the cumulative match file
append using "nyc_linkedin_firm_match.dta"
duplicates drop 
save "nyc_linkedin_firm_match.dta",replace
//refresh the matched flag on the agency list
keep agencyname2_id 

duplicates drop 
merge 1:1 agencyname2_id using  "nyc_agency.dta"
replace matched_to_linkedin=1 if _merge==3
drop _merge
save "nyc_agency.dta",replace
}

* LinkedIn promotion rate by fiscal year for the matched NYC employers, compared with the
* payroll-based promotion rate stored in nyc_data.dta.

use employer_id2 user_id year start_month promotion_current using "experience_final_update_2025_expand_us.dta", clear
//fiscal year = calendar year, plus one when start_month is July or later
//NOTE(review): a missing start_month also satisfies (start_month >= 7) in Stata, so such rows are
//assigned to year+1 - confirm this is intended.
gen start_fiscal_year = year + (start_month >= 7)
//replace: allow re-runs (a bare -save- aborts when the file already exists)
save  "fiscal_year_expand_us.dta", replace

use "nyc_linkedin_firm_match.dta", clear
keep employer_id2 
duplicates drop 
//add "city of new york" (employer_id2 4774704) as an extra employer.
//The original wrote it into observation 48, which depends on the row order and count of the match
//file and overwrites whichever employer happens to sit there (or errors when fewer than 48 rows
//exist). Appending a new row keeps every matched employer.
local n_new = _N + 1
set obs `n_new'
replace employer_id2 = 4774704 in `n_new'
//the id may already be present; the merge below needs unique keys
duplicates drop 
merge 1:m employer_id2 using "fiscal_year_expand_us.dta"
keep if _merge==3
drop _merge
save  "fiscal_year_expand_us2.dta", replace

//denominator: distinct employer-user pairs per fiscal year
bysort employer_id2 start_fiscal_year user_id: gen mark=1 if _n==1
bysort start_fiscal_year: egen total_mark2=total(mark)
//numerator: number of promotion records per fiscal year
replace promotion_current=0 if missing(promotion_current)
bysort  start_fiscal_year: egen promotion_rate_ey_linkedin2=total(promotion_current)
replace promotion_rate_ey_linkedin2=promotion_rate_ey_linkedin2/total_mark2
keep start_fiscal_year promotion_rate_ey_linkedin2
duplicates drop 
rename start_fiscal_year fiscalyear
//attach the LinkedIn rate to the payroll data; LinkedIn-only years are discarded
merge 1:m fiscalyear using  "nyc_data.dta"
drop if _merge==1
drop _merge
save "nyc_data.dta",replace
//one row per fiscal year, restricted to 2015-2024, then plot both series
keep fiscalyear promotion_rate  promotion_rate_ey_linkedin2
bysort fiscalyear: drop if _n!=1
keep if fiscalyear>2014&fiscalyear<2025
twoway line promotion_rate fiscalyear || line promotion_rate_ey_linkedin2 fiscalyear

}


**# ACS
{
//industry-year: demographic composition
{
* LinkedIn demographic composition by 2-digit NAICS and year.
* Intermediate person-level data are kept in a tempfile. The original saved them under the name of
* the master career-history file: the first -save- had no -replace- and aborted, and the later
* -save, replace- would have overwritten the master file with a reduced subset that other parts of
* this do-file still read (employer_id2, start_month, promotion_current, soccode_final).
* Run this block in one go: the tempfile name lives in a local macro.
tempfile exp_demog

use user_id year naics_code2 naics_final2 using "experience_final_update_2025_expand_us.dta", clear //linkedin career histories file at the individual-year level
rename user_id id2
merge m:1 id2 using pic_race_gender_1218.dta //picture-based race (white and black)
drop if _merge==2
drop _merge
save `exp_demog'

use user_id f_prob api_prob hispanic_prob using "user_id_main.dta", clear //name-based gender and race (asian and hispanic)
rename user_id id2
merge 1:m id2 using `exp_demog'
drop if _merge==1
drop _merge

//one row per person-industry-year so each person counts once in the means below
bysort id2 naics_final2 year: gen mark=1 if _n==1
keep if mark==1
//NOTE(review): the *mean2 variables are expected to come from pic_race_gender_1218.dta - confirm.
foreach var of varlist *mean2{
	bysort naics_final2 year: egen prop_`var'=mean(`var')  //prop of black or white workers for each naics 2-digit -year group.
}

foreach var of varlist *_prob{
	bysort naics_final2 year: egen prop_`var'=mean(`var') //prop of asian, female, hispanic workers for each naics 2-digit -year group.
}

keep naics_final2 year prop_*
bysort naics_final2 year: drop if _n!=1
save "/acs/naics_year.dta",replace //this file contains demographic composition for each industry-year level based on LinkedIn data

* ACS demographic composition by 2-digit NAICS and year.
use "/acs/acs.dta"

* 0/1 group indicators (missing codes count as 0 for race/sex; any hispan value other than 0 counts as 1)
gen white    = (race==1)
gen black    = (race==2)
gen asian    = inlist(race,4,5,6)
gen female   = (sex==2)
gen hispanic = (hispan!=0)

* industry = first two characters of indnaics (left missing when the code is shorter than two characters)
gen naics2=substr(indnaics,1,2) if length(indnaics)>=2

* share of each group within industry-year
* NOTE(review): these are unweighted means over ACS records - confirm that no person weight is intended.
foreach grp in white black asian female hispanic {
	bysort naics2 year: egen prop_`grp'_acs=mean(`grp')
}
save "/acs/acs.dta",replace

* collapse to one row per industry-year
keep naics2 year prop_*
bysort naics2 year: drop if _n!=1

* year is value-labelled: turn the label text into a numeric year
decode year, gen(year2)
drop year
rename year2 year
destring year, replace
save "/acs/acs_naics_year.dta",replace //this file contains demographic composition for each industry-year level based on ACS data


* Combine the LinkedIn and ACS industry-year demographic shares (matched industry-years only).
//-use- accepts -clear-, not -replace-
use "/acs/naics_year.dta", clear
//full variable name instead of the abbreviation "naics"
rename naics_final2 naics2 
//the ACS industry code is a string, so convert before merging
tostring naics2, replace
merge 1:1 naics2 year using "/acs/acs_naics_year.dta"
keep if _merge==3
drop _merge
drop if missing(naics2)
//replace: allow re-runs (a bare -save- aborts when the file already exists)
save "/acs/match_naics_year.dta", replace

* For each group: correlation between the LinkedIn and ACS industry-year shares,
* followed by a binned scatter (ACS on the y-axis, LinkedIn on the x-axis).

* hispanic
corr prop_hispanic_prob prop_hispanic_acs
binscatter  prop_hispanic_acs prop_hispanic_prob, ///
	xtitle("Proportion of Hispanic Workers (from LinkedIn)") ///
	ytitle("Proportion of Hispanic Workers (from ACS)") xlabel(,nogrid)

* black
corr prop_black_mean2 prop_black_acs
binscatter  prop_black_acs prop_black_mean2, ///
	xtitle("Proportion of Black Workers (from LinkedIn)") ///
	ytitle("Proportion of Black Workers (from ACS)") xlabel(,nogrid)

* white
corr prop_white_mean2 prop_white_acs
binscatter  prop_white_acs prop_white_mean2, ///
	xtitle("Proportion of White Workers (from LinkedIn)") ///
	ytitle("Proportion of White Workers (from ACS)") xlabel(,nogrid)

* asian
corr prop_api_prob prop_asian_acs
binscatter  prop_asian_acs prop_api_prob, ///
	xtitle("Proportion of Asian Workers (from LinkedIn)") ///
	ytitle("Proportion of Asian Workers (from ACS)") xlabel(,nogrid)

* women
corr prop_f_prob prop_female_acs
binscatter  prop_female_acs prop_f_prob, ///
	xtitle("Proportion of Women Workers (from LinkedIn)") ///
	ytitle("Proportion of Women Workers (from ACS)") xlabel(,nogrid)
}
//industry-year: occupational composition
{
* LinkedIn occupational composition (EEO categories) by 2-digit NAICS and year.
* The intermediate file is a tempfile. The original saved it under the name of the master
* career-history file: the first -save- had no -replace- and aborted, and the later
* -save, replace- would have overwritten the master file with a reduced subset.
* Run this block in one go: the tempfile name lives in a local macro.
tempfile exp_occ

use soccode_final year user_id naics_code2 naics_final2 using "experience_final_update_2025_expand_us.dta", clear
drop if missing(soccode_final)|missing(naics_code2)
//SOC code as text (taken from the value label) to match the crosswalk key
decode soccode_final,gen(soccode_final2)
drop soccode_final 
save `exp_occ'

//crosswalk between EEO Occupation categories and SOC codes (-use- accepts -clear-, not -replace-)
use "title.dta", clear
keep eeo_title_categories soccode_final2
drop if missing(eeo_title_categories)
duplicates drop
merge 1:m soccode_final2 using `exp_occ'
drop if _merge==1
drop _merge

//combine "Executive & senior managers" with "First/middle managers" as in the EEO-1 restricted-level data
replace eeo_title_categories=3 if eeo_title_categories==4
keep eeo_title_categories naics_final2 user_id year
//one row per person-category-industry-year
bysort user_id eeo_title_categories naics_final2 year: gen mark=1 if _n==1
keep if mark==1
//one flag per person-industry-year (denominator)
bysort user_id naics_final2 year: gen mark2=1 if _n==1
bysort eeo_title_categories naics_final2 year: egen total_mark=total(mark)
bysort naics_final2 year: egen total_mark2=total(mark2)
//share of people in the industry-year who hold the category
gen prop_linkedin_occ=total_mark/total_mark2
keep naics_final2 year eeo_title_categories prop_linkedin_occ
bysort naics_final2 year eeo_title_categories: drop if _n!=1
drop if missing(naics_final2)|missing(eeo_title_categories)
//replace: allow re-runs (a bare -save- aborts when the file already exists)
save "/acs/linkedin_naics_occ_year.dta", replace //this file contains occupational composition for each industry-year level based on LinkedIn data


// Build the ACS-based occupational composition for each industry-year.
// Output: "/acs/acs_naics_occ_year.dta" with one row per
// (naics2, year, eeo_title_categories) and the share prop_acs_occ.

use "/acs/acs.dta", clear //-use- takes -clear-, not -replace-
// acs.dta is saved back below with this variable in it, so drop any copy left by a previous run.
cap drop eeo_title_categories
// Map the EEO occupation text to the numeric category codes by substring match.
gen eeo_title_categories=3 if strpos(eeo_occ,"Exec")!=0
replace eeo_title_categories=4 if strpos(eeo_occ,"First")!=0
replace eeo_title_categories=7 if strpos(eeo_occ,"Professional")!=0
replace eeo_title_categories=10 if strpos(eeo_occ,"Tech")!=0
replace eeo_title_categories=1 if strpos(eeo_occ,"Support")!=0
replace eeo_title_categories=9 if strpos(eeo_occ,"Service")!=0
replace eeo_title_categories=5 if strpos(eeo_occ,"Helper")!=0
replace eeo_title_categories=8 if strpos(eeo_occ,"Sale")!=0
replace eeo_title_categories=2 if strpos(eeo_occ,"Craft")!=0
replace eeo_title_categories=6 if strpos(eeo_occ,"Opera")!=0
save "/acs/acs.dta", replace

replace eeo_title_categories=3 if eeo_title_categories==4 //pool the two manager categories (codes 3 and 4)
gen naics2=substr(indnaics,1,2) //first two characters of the industry code
// Shares are based on raw row counts.
// NOTE(review): no sampling weights are applied, and rows without a category still count
// in the industry-year denominator. Confirm both are intended.
bysort naics2 year: gen total=_N
bysort naics2 year eeo_title_categories: gen total_haha=_N
keep naics2 year eeo_title_categories total total_haha
drop if missing(naics2)|missing(eeo_title_categories)
duplicates drop
gen prop_acs_occ=total_haha/total
save "/acs/acs_naics_occ_year.dta", replace //this file contains occupational composition for each industry-year level based on ACS data


// Combine the ACS and LinkedIn industry-year occupational shares into one file,
// keeping only (industry, year, category) cells present in both sources.
use "/acs/acs_naics_occ_year.dta", clear
rename naics2 naics_final2
keep naics_final2 year eeo_title_categories prop_*
// Convert year to a plain number via its value labels.
// NOTE(review): -decode- requires year to carry a value label. Confirm for the ACS extract in use.
decode year,gen(year2)
drop year
rename year2 year
destring year,replace
// Industry codes that are not purely numeric become missing (-force-) and are dropped.
destring naics_final2, replace force
drop if missing(naics_final2)
merge 1:1 naics_final2 year eeo_title_categories using "/acs/linkedin_naics_occ_year.dta"
keep if _merge==3
drop _merge
save "/acs/match_naics_occ_year.dta", replace //-replace- so the script can be rerun

// ACS vs. LinkedIn occupational shares, one category at a time (years 2000 onward):
// print the correlation, then draw the binned scatterplot.
// Axis titles are stored per category code so each plot keeps its own wording.
local xt3  "Proportion of Managers (from LinkedIn)"
local yt3  "Proportion of Managers (from ACS)"
local xt1  "Proportion of Administrative Support Workers  (from LinkedIn)"
local yt1  "Proportion of Administrative Support Workers  (from ACS)"
local xt2  "Proportion of Craft Workers  (from LinkedIn)"
local yt2  "Proportion of Craft Workers  (from ACS)"
local xt5  "Proportion of Labors and Helpers (from LinkedIn)"
local yt5  "Proportion of  Labors and Helpers (from ACS)"
local xt6  "Proportion of Operatives (from LinkedIn)"
local yt6  "Proportion of Operatives (from ACS)"
local xt7  "Proportion of Professionals (from LinkedIn)"
local yt7  "Proportion of Professionals (from ACS)"
local xt8  "Proportion of Sales Workers (from LinkedIn)"
local yt8  "Proportion of Sales Workers (from ACS)"
local xt9  "Proportion of Service Workers (from LinkedIn)"
local yt9  "Proportion of Service Workers (from ACS)"
local xt10 "Proportion of Technicians (from LinkedIn)"
local yt10 "Proportion of Technicians (from ACS)"

// Order: manager, admin support, craft, labors, operative, professionals, sales, services, technicians
foreach cat in 3 1 2 5 6 7 8 9 10 {
	corr prop_acs_occ prop_linkedin_occ if eeo_title_categories==`cat'&year>=2000
	binscatter prop_acs_occ prop_linkedin_occ if eeo_title_categories==`cat'&year>=2000, xtitle("`xt`cat''") ytitle("`yt`cat''") xlabel(,nogrid)

}

}
}

**# EEO-1
{

//firm-year: demographic composition
{
// Attach picture-based and name-based demographic measures to the LinkedIn
// person-firm-year experience records.

use user_id year employer_id2 using "experience_final_update_0923.dta", clear
rename user_id id2
merge m:1 id2 using "pic_race_gender_1218.dta" //picture-based race (white and black)
drop if _merge==2 //drop picture records with no experience rows
drop _merge
// -replace- is required here: the file already exists (it was just read), so a bare -save- aborts.
// NOTE(review): this overwrites the input with user_id renamed to id2, so a second run fails at
// the -use- above. Consider writing to a separate working file.
save "experience_final_update_0923.dta", replace

use user_id f_prob api_prob hispanic_prob using "user_id_main.dta", clear //name-based gender and race (asian and hispanic)
rename user_id id2
merge 1:m id2 using  "experience_final_update_0923.dta"
drop if _merge==1 //drop users with no experience rows
drop _merge
save "experience_final_update_0923.dta",replace

// Build firm-year demographic shares from LinkedIn and from EEO-1, then match them.
// Output: "/eeo/match_test.dta", with -confidence- = LinkedIn headcount / EEO-1 headcount.

// --- LinkedIn side ---
bysort id2 employer_id2 year: gen mark=1 if _n==1
keep if mark==1 //one row per person-firm-year
foreach var of varlist *mean2{
	bysort employer_id2 year: egen prop_`var'=mean(`var')  //prop of black or white workers for each firm-year group.
}

foreach var of varlist *_prob{
	bysort employer_id2 year: egen prop_`var'=mean(`var') //prop of asian, female, hispanic workers for each firm-year group.
}

save "experience_final_update_0923.dta",replace

// Num. LinkedIn employees per firm-year (rows are unique person-firm-years at this point).
// It is the numerator of -confidence- below, so it must be carried into the firm-year file.
bysort employer_id2 year: gen total=_N
keep employer_id2 year prop_* total
bysort employer_id2 year: drop if _n!=1
save "/eeo/employer_year_cal.dta",replace //this file contains demographic composition for each firm-year level based on LinkedIn data

use "/eeo/exact_match.dta", clear //crosswalk between EEO-1 and LinkedIn firms
drop firm_name
merge 1:m employer_id2 using "/eeo/employer_year_cal.dta"
drop if _merge==1 //drop crosswalk firms with no LinkedIn firm-year rows
drop _merge
save "/eeo/employer_year_cal.dta",replace //same firm-year file, now carrying the EEO-1 firm IDs

// --- EEO-1 side ---
// The establishment-level EEO-1 variables used below (p_name, total10, blkt10, ...) are not in
// the data left in memory by the previous step, so the EEO-1 file has to be loaded first.
// NOTE(review): assumed to be the same restricted-use file the occupational section loads. Confirm.
use "/eeo/allfirms_consolidated_correct.dta", clear
bysort p_name year: egen firm_t10=total(total10)
foreach var of varlist blkt10 asiant10 aiant10 hispt10 wht10 ft10 mt10{
	bysort p_name year: egen firm_`var'=total(`var') //group establishment data at the firm level
	local name=subinstr("`var'","t10","",.)
	gen prop_firm_`name'=firm_`var'/firm_t10

}
// Keep firm_t10 as well: it is the denominator of -confidence-.
keep eeo_company_id year prop_firm* firm_t10
bysort eeo_company_id year: drop if _n!=1
merge 1:m eeo_company_id year using "/eeo/employer_year_cal.dta"
keep if _merge==3
drop _merge
gen confidence=total/firm_t10 //confidence score= num. employees in LinkedIn / num. employees in EEO-1
save "/eeo/match_test.dta", replace //-replace- so the script can be rerun

// Binned scatterplots of EEO-1 (y-axis) against LinkedIn (x-axis) firm-year demographic
// shares, restricted to confidence>=0.3 and years 2000 onward. Each figure is exported as EPS.
set scheme cblind1
local sample "confidence>=0.3&year>=2000"
local outdir "/n/holylfs06/LABS/lezhang_lab/Lab/Simeng/odd/data_paper"

// Black workers. The LinkedIn measure is divided by 100 before plotting
// (presumably stored in percent; verify against the picture-based source file).
replace prop_black_mean2=prop_black_mean2/100
binscatter prop_firm_blk prop_black_mean2 if `sample', ///
	xtitle("Proportion of Black Workers (from LinkedIn)") ///
	ytitle("Proportion of Black Workers (from EEOC)") ///
	xlabel(,nogrid)
graph export "`outdir'/eeoc_validation_overall_black2.eps", as(eps) name("Graph") preview(off)

// White workers. Same rescaling as above.
replace prop_white_mean2=prop_white_mean2/100
binscatter prop_firm_wh prop_white_mean2 if `sample', ///
	xtitle("Proportion of White Workers (from LinkedIn)") ///
	ytitle("Proportion of White Workers (from EEOC)") ///
	xlabel(,nogrid)
graph export "`outdir'/eeoc_validation_overall_white2.eps", as(eps) name("Graph") preview(off)

// Women.
binscatter prop_firm_f prop_f_prob if `sample', ///
	xtitle("Proportion of Women Workers (from LinkedIn)") ///
	ytitle("Proportion of Women Workers (from EEOC)") ///
	xlabel(,nogrid)
graph export "`outdir'/eeoc_validation_overall_female2.eps", as(eps) name("Graph") preview(off)

// Asian workers.
binscatter prop_firm_asian prop_api_prob if `sample', ///
	xtitle("Proportion of Asian Workers (from LinkedIn)") ///
	ytitle("Proportion of Asian Workers (from EEOC)") ///
	xlabel(,nogrid)
graph export "`outdir'/eeoc_validation_overall_asian2.eps", as(eps) name("Graph") preview(off)

// Hispanic workers.
binscatter prop_firm_hisp prop_hispanic_prob if `sample', ///
	xtitle("Proportion of Hispanic Workers (from LinkedIn)") ///
	ytitle("Proportion of Hispanic Workers (from EEOC)") ///
	xlabel(,nogrid)
graph export "`outdir'/eeoc_validation_overall_hispanic2.eps", as(eps) name("Graph") preview(off)

 
}
//firm-year: occupational composition
{
// Count distinct LinkedIn employees per firm-year and per firm-year-occupation,
// then attach the EEO-1 firm IDs from the crosswalk.
use "experience_final_us_updated_expand9023_soc.dta", clear
drop if missing(eeo_title_categories)
replace eeo_title_categories=3 if eeo_title_categories==4 //pool the two manager categories (codes 3 and 4)
bysort id2 employer_id2 eeo_title_categories year: gen mark2=1 if _n==1 //one flag per person-firm-occupation-year
bysort id2 employer_id2  year: gen mark=1 if _n==1 //one flag per person-firm-year
bysort employer_id2 eeo_title_categories year: egen total_mark2=total(mark2) //employees in the occupation
bysort employer_id2  year: egen total_mark=total(mark) //employees in the firm-year
keep employer_id2 year eeo_title_categories total_mark total_mark2
bysort employer_id2 eeo_title_categories year: drop if _n!=1
save "/eeo/occ_naics_firm_year.dta", replace //this file contains occupational headcounts for each firm-year level based on LinkedIn data

use "/eeo/exact_match.dta", clear //crosswalk between EEO-1 and LinkedIn firms
drop firm_name
merge 1:m employer_id2 using "/eeo/occ_naics_firm_year.dta"
drop if _merge==1 //drop crosswalk firms with no LinkedIn rows
drop _merge
save "/eeo/occ_naics_firm_year.dta",replace //merged in EEO-1 firm IDs

// Build EEO-1 firm-year occupational shares and match them to the LinkedIn counts.
// Output: "/eeo/match_occ_firm_year.dta" with prop_eeoc, linkedin_prop and confidence.

use "/eeo/allfirms_consolidated_correct.dta", clear //restricted-use EEO-1 data
foreach num of numlist 1/9{
	bysort p_name year: egen firm_t`num'=total(total`num') //sum establishments to the firm level
}
bysort p_name year: egen firm_t10=total(total10)
foreach num of numlist 1/9{
	gen prop_firm_t`num'=firm_t`num'/firm_t10
}
// Keep firm_t10 as well: it is the denominator of -confidence- below.
keep eeo_company_id year prop_firm_t* firm_t10
bysort eeo_company_id year: drop if _n!=1
save "/eeo/eeo_occ_firm_year.dta", replace //this file contains occupational composition for each firm-year level based on EEO-1 data

use "/eeo/occ_naics_firm_year.dta", clear
// Bring in the EEO-1 shares. Without this merge prop_firm_t1-prop_firm_t9 are not in memory.
merge m:1 eeo_company_id year using "/eeo/eeo_occ_firm_year.dta"
keep if _merge==3
drop _merge
// Pick the EEO-1 share column that corresponds to each row's occupation category code.
gen prop_eeoc=.
replace prop_eeoc=prop_firm_t1 if eeo_title_categories==3
replace prop_eeoc=prop_firm_t2 if eeo_title_categories==7
replace prop_eeoc=prop_firm_t3 if eeo_title_categories==10
replace prop_eeoc=prop_firm_t4 if eeo_title_categories==8
replace prop_eeoc=prop_firm_t5 if eeo_title_categories==1
replace prop_eeoc=prop_firm_t6 if eeo_title_categories==2
replace prop_eeoc=prop_firm_t7 if eeo_title_categories==6
replace prop_eeoc=prop_firm_t8 if eeo_title_categories==5
replace prop_eeoc=prop_firm_t9 if eeo_title_categories==9
gen linkedin_prop=total_mark2/total_mark
// The correlations and plots that follow filter on -confidence-, which was never created here.
// NOTE(review): defined by analogy with the demographic section as LinkedIn headcount over
// EEO-1 headcount. Confirm this is the intended definition.
gen confidence=total_mark/firm_t10
save "/eeo/match_occ_firm_year.dta",replace

// EEO-1 vs. LinkedIn occupational shares at the firm-year level, one category at a time
// (confidence>=0.3, years 2000 onward): print the correlation, then draw the binned scatterplot.
// Axis titles are stored per category code so each plot keeps its own wording.
local xt3  "Proportion of Managers (from LinkedIn)"
local yt3  "Proportion of Managers (from EEOC)"
local xt1  "Proportion of Administrative Support Workers  (from LinkedIn)"
local yt1  "Proportion of Administrative Support Workers  (from EEOC)"
local xt2  "Proportion of Craft Workers  (from LinkedIn)"
local yt2  "Proportion of Craft Workers  (from EEOC)"
local xt5  "Proportion of Labors and Helpers (from LinkedIn)"
local yt5  "Proportion of  Labors and Helpers (from EEOC)"
local xt6  "Proportion of Operatives (from LinkedIn)"
local yt6  "Proportion of Operatives (from EEOC)"
local xt7  "Proportion of Professionals (from LinkedIn)"
local yt7  "Proportion of Professionals (from EEOC)"
local xt8  "Proportion of Sales Workers (from LinkedIn)"
local yt8  "Proportion of Sales Workers (from EEOC)"
local xt9  "Proportion of Service Workers (from LinkedIn)"
local yt9  "Proportion of Service Workers (from EEOC)"
local xt10 "Proportion of Technicians (from LinkedIn)"
local yt10 "Proportion of Technicians (from EEOC)"

// Order: manager, admin support, craft, labors, operatives, professionals, sales, service, technicians
foreach cat in 3 1 2 5 6 7 8 9 10 {
	corr prop_eeoc linkedin_prop if eeo_title_categories==`cat'&confidence>=0.3&year>=2000
	binscatter prop_eeoc linkedin_prop if eeo_title_categories==`cat'&confidence>=0.3&year>=2000, xtitle("`xt`cat''") ytitle("`yt`cat''") xlabel(,nogrid)

}
}



}


