/*******************************************************************************
	
		variable_construction.do
		10.19.2025
		Constructing Dependent Variables for:
		"The Value of Internal Labor Markets: Evidence from LinkedIn Profiles and U.S. Inventors"
		Letian (LT) Zhang and Simeng Wang
		
*******************************************************************************/

**# PatentsView Variable Construction
	**# unadjusted three-year forward citation at patent level
{
** Purpose: for every cited patent, count the citing patents granted within
** [cited grant year, cited grant year + 3] and save the result to
** three_year_citation.dta (one row per cited patent-grant year at this stage).
use  "/patentview data/g_patent_citation.dta", clear
** merge in citing_patent_id's grant year
rename citing_patent_id patent_id 
merge m:1 patent_id using "/patentview data/g_patent.dta", keepusing(patent_id grant_year)
** citing patents that are absent from g_patent stay in the data with a missing grant year
drop if _merge==2
drop _merge
rename patent_id citing_patent_id
rename grant_year citing_patent_grant_year
keep *patent_id citation_year citing_patent_grant_year
** from here on citation_year is treated as the cited patent's grant year
rename citation_year cited_patent_grant_year
duplicates drop
** flag citations whose citing patent was granted 0 to 3 years after the cited patent
gen three_year=1 if citing_patent_grant_year>=cited_patent_grant_year & citing_patent_grant_year<=cited_patent_grant_year+3 & !missing(cited_patent_grant_year)&!missing(citing_patent_grant_year)
** count the flagged citations per cited patent; rows outside the window get a missing count for now
bysort cited_patent_id three_year: gen three_year_citation=_N if three_year==1
keep cited_patent_id three_year_citation cited_patent_grant_year
** copy the count to every row of the cited patent: in a descending gsort missing values
** are placed last, so the first row within a patent holds the non-missing count when one exists
gsort cited_patent_id -three_year_citation 
by cited_patent_id: replace three_year_citation=three_year_citation[1]
duplicates drop
** replace lets the section be re-run after the file has been created once
save "/patentview variable/three_year_citation.dta", replace

**we found that in the citation file, there are some typos in the cited patent's grant year, so some cited_patent_id values have multiple grant years. To address this issue, we merge in the grant year from the patent file. 
use "/patentview data/g_patent.dta", clear
keep patent_id grant_year
rename patent_id cited_patent_id
merge 1:m cited_patent_id using "/patentview variable/three_year_citation.dta"
drop if _merge==1
** number of rows per cited patent. The filter below relies on this variable, which was
** previously never created in this section, so the drop stopped with "variable count not found".
bysort cited_patent_id: gen count=_N
** for patents with several rows, keep only the row whose year agrees with the patent file
** NOTE(review): a multi-row patent whose recorded years all disagree with grant_year loses every row here -- confirm this is intended.
drop if _merge==3&grant_year!=cited_patent_grant_year&count>1
drop grant_year _merge
save "/patentview variable/three_year_citation.dta",replace
}
	**# identify each patent's primary cpc category (technology class)
{
** Purpose: for each patent keep the CPC class recorded at its lowest cpc_sequence
** value and save it as primary_cpc_class.
use "/patentview data/g_cpc_current.dta",clear
bysort patent_id: egen min_sequence=min(cpc_sequence)
gen primary_cpc_class=cpc_class if cpc_sequence==min_sequence
keep if cpc_sequence==min_sequence
keep primary_cpc_class patent_id
** NOTE(review): the 1:m merge on patent id in the class-year adjustment section needs one row
** per patent here, which holds only if the lowest cpc_sequence is unique within a patent -- confirm.
** replace lets the section be re-run after the file has been created once
save "/patentview variable/primary_cpc_class.dta", replace
}
	**# technology class-year adjusted three-year forward citation
{
** Purpose: divide each patent's three-year forward citation count by the mean count of the
** OTHER patents in the same primary CPC class and cited-patent grant year (leave-one-out mean).
use "/patentview variable/primary_cpc_class.dta",clear
rename patent_id cited_patent_id
tostring cited_patent_id,replace
** keep every citation row (with or without a CPC class); rows that exist only in the class file are discarded
merge 1:m cited_patent_id using "/patentview variable/three_year_citation.dta", keep(using match) nogenerate
replace primary_cpc_class=99999 if missing(primary_cpc_class) //15% of patents granted during our sample period (2000-2025) have missing cpc class. 
replace three_year_citation=0 if three_year_citation>=.
** group definition shared by the two class-year aggregates
local class_year primary_cpc_class cited_patent_grant_year
bysort `class_year': egen total_citation=total(three_year_citation)
bysort `class_year': gen total_patent=_N
** leave-one-out mean: the focal patent's own count is removed from numerator and denominator
gen mean_techclass_year=(total_citation-three_year_citation)/(total_patent-1)
** fall back to the unadjusted count when the focal patent is alone in its class-year group,
** or when no other patent in the group has any citation (leave-one-out total of zero)
gen weighted_three_year_citation=cond(total_patent==1 | total_citation-three_year_citation==0, three_year_citation, three_year_citation/mean_techclass_year)

save "/patentview variable/three_year_citation.dta",replace

}
	**# co-authored adjusted three-year forward citation
{
** Purpose: count inventors per patent, save that count as its own file, and divide the
** class-year adjusted citation measure by the number of inventors.
use "/patentview data/g_inventor.dta", clear
bysort patent_id: gen num_inventors=_N
keep patent_id num_inventors
** one row per patent
bysort patent_id: drop if _n!=1
** replace lets the section be re-run after the file has been created once
save "/patentview variable/num_inventors_patent_id.dta", replace
rename patent_id cited_patent_id
merge 1:m cited_patent_id using "/patentview variable/three_year_citation.dta"
** drop patents that never appear in the citation file; citation rows without inventor
** information are kept and get a missing co-author weighted value
drop if _merge==1
drop _merge
gen coauthor_wt_three_year_citation=weighted_three_year_citation/num_inventors
**we found that a few patent_id values have multiple grant years (less than 3% of patents), likely due to some errors during PatentsView's disambiguation process. For these cases, we kept the latest grant year.  
bysort cited_patent_id: egen max_grant_year=max(cited_patent_grant_year)
keep if cited_patent_grant_year==max_grant_year
cap drop count
bysort cited_patent_id: gen count=_N //rows per patent; a value above 1 means the patent still appears more than once 
save "/patentview variable/three_year_citation.dta",replace

}
	**# num. applications 
{
** Purpose: count distinct published applications (pgpub_id) per inventor and filing year.
** clear matches the other sections so this one can be run on its own with data in memory
use "/patentview data/pg_published_application.dta", clear

keep inventor_id filing_date_year pgpub_id
** one row per inventor-year-application before counting
duplicates drop
bysort inventor_id filing_date_year: gen application_count_iy=_N
keep inventor_id filing_date_year application_count_iy 
drop if missing(filing_date_year)
** collapse to one row per inventor-year
duplicates drop 
** replace lets the section be re-run after the file has been created once
save "/patentview data/application_count_iy.dta", replace
}

	**# patent value 
{
** Purpose: attach inventors, assignees, firm identifiers and application years to the
** KPSS patent values, then aggregate real patent value by inventor, year and firm.
** NOTE(review): this folder is spelled "patentsview data" while the other raw inputs in this
** file are read from "patentview data" -- confirm the path is intended.
import delimited using "/patentsview data/KPSS_2023.csv", clear
rename patent_num patent_id
keep patent_id xi_nominal xi_real 
duplicates drop
tostring patent_id, replace 
** keep only patents that also appear in the inventor-count file
merge 1:m patent_id using "/patentview variable/num_inventors_patent_id.dta"
keep if _merge==3
drop _merge
** expand to one row per patent-inventor
merge 1:m patent_id using "/patentview data/g_inventor.dta"
keep if _merge==3
** this keep also removes _merge
keep patent_id xi_* inventor_id 
save "/patentview data/patent_value.dta", replace

** form every inventor-assignee combination within a patent
joinby patent_id using "/patentview data/g_assignee.dta"
** joinby only creates _merge when unmatched() is specified, and the previous _merge was removed
** by the keep above, so an unconditional drop stops with "variable _merge not found";
** capture removes the variable only if the using file happens to carry one
capture drop _merge
keep patent_id xi_* inventor_id assignee_id
save "/patentview variable/patent_value.dta", replace

** add the firm identifier for each assignee
use "/firm_crosswalk/assignee_id_name.dta", clear
keep assignee_id assignee_firm_id3
duplicates drop
merge 1:m assignee_id using "/patentview variable/patent_value.dta"
drop if _merge==1
drop _merge
save "/patentview variable/patent_value.dta",replace

** add the application year, stored as year
use "/patentview data/g_application.dta", clear
keep patent_id application_year
duplicates drop 
rename application_year year
merge 1:m patent_id using  "/patentview variable/patent_value.dta"
drop if _merge==1
drop _merge
save  "/patentview variable/patent_value.dta",replace
 
merge m:1 inventor_id using "/individual crosswalk/inventor_id_name.dta",keepusing(inventor_id2) 
//inventor_id in the PatentsView raw data is a string, and this file contains the numeric version to save space
drop if _merge==2
drop _merge
save  "/patentview variable/patent_value.dta",replace
** aggregate real patent value within inventor-year-firm
bysort inventor_id2 year assignee_firm_id3: egen total_patent_value_real=total(xi_real)
bysort inventor_id2 year assignee_firm_id3: egen mean_patent_value_real=mean(xi_real)
keep inventor_id2 year assignee_firm_id3 total_patent_value_real mean_patent_value_real
** collapse to one row per inventor-year-firm
duplicates drop
save  "/patentview variable/inventor_year_assignee_patent_value.dta",replace
}
	**# individual-assignee-application year level measures
{
** Purpose: build patent counts and citation totals at the inventor - firm - application year level.
use "/patentview data/g_application.dta",clear
keep patent_id application_year
duplicates drop
** expand to one row per patent-inventor
merge 1:m patent_id using "/patentview data/g_inventor.dta",keepusing(patent_id inventor_id)
drop if _merge!=3 
//810 patents have no inventor information and 3926 patents have no application year information
drop _merge
** replace lets the section be re-run after the file has been created once
save "/patentview variable/individual_assignee_year_level_measure.dta", replace

use "/patentview data/g_assignee.dta",clear
** count_assignee is only an inspection aid; it is removed by the keep below
bysort patent_id: gen count_assignee=_N 
//94% of patents have one unique organization assignee. The remaining 6% of patents have multiple assignees, which means we cannot tell which assignee is the employer. To address this, we keep all possible inventor-assignee combinations; false matches would not be matched with a LinkedIn employment record under the criteria required in our matching process. 
keep if !missing(disambig_assignee_organization) 
keep patent_id assignee_id
duplicates drop
drop if missing(patent_id)|missing(assignee_id)
recast str assignee_id
** add the firm identifier; assignees without a crosswalk entry are kept with a missing firm id
merge m:1 assignee_id using "/firm_crosswalk/assignee_id_name.dta",keepusing(assignee_id assignee_firm_id3)
drop if _merge==2
drop _merge 
keep patent_id assignee_firm_id3
duplicates drop
** form every inventor-firm combination within a patent
joinby patent_id using "/patentview variable/individual_assignee_year_level_measure.dta", unmatched(both)
keep if _merge==3 
//only keep patent_id values that have both inventor and organization assignee information available. 
drop _merge
save "/patentview variable/individual_assignee_year_level_measure.dta",replace

** clear matches the other sections so this step can be run with modified data in memory
use "/patentview variable/three_year_citation.dta", clear
rename cited_patent_id patent_id
keep patent_id three_year_citation weighted_three_year_citation coauthor_wt_three_year_citation
merge 1:m patent_id using "/patentview variable/individual_assignee_year_level_measure.dta"
drop if _merge==1
foreach var of varlist three_year_citation weighted_three_year_citation coauthor_wt_three_year_citation{
	replace `var'=0 if _merge==2 //do not have any citations (hence not included in the citation files.)
}
drop _merge
save "/patentview variable/individual_assignee_year_level_measure.dta",replace

** aggregate to inventor - firm - application year
bysort inventor_id assignee_firm_id3 application_year: gen patent_count=_N
bysort inventor_id assignee_firm_id3 application_year: egen total_three_year_citation=total(three_year_citation)
bysort inventor_id assignee_firm_id3 application_year: egen total_wt_three_year_citation=total(weighted_three_year_citation)
bysort inventor_id assignee_firm_id3 application_year: egen total_cwt_three_year_citation=total(coauthor_wt_three_year_citation)
keep inventor_id assignee_firm_id3 application_year patent_count total_three_year_citation total_wt_three_year_citation total_cwt_three_year_citation
** collapse to one row per inventor - firm - application year
duplicates drop
save "/patentview variable/individual_assignee_year_level_measure.dta",replace
}
