cap program drop classifyNeighbors
program define classifyNeighbors
args coordinatesFile ///
longitudeVar ///
latitudeVar ///
coordinatesKEEP_STRINGS ///
distanceFile ///
distanceUnit ///
distanceVar ///
distanceKEEP_STRINGS ///
areaFile ///
areaUnit ///
areaVar ///
areaKEEP_STRINGS ///
neighbroingLinesFile ///
rootDir ///
PATH_GIS ///
geoid ///
idVar ///
maxNeighborDegree
***********************IMPORTANT NOTE ABOUT FILES***************************
* The files used for the coordinates, area, distance, and neighboring lines
* are allowed to be different files, although they do not have to be, but it
* is crucial that they all are related to some parent file such that the
* variable "objectid" which is assigned by ArcGIS is the same for all the
* spatial units. This means that you cannot use a file with the coordinates
* of the centroids of the spatial units that originated from a different
* shapefile, and clearly not from a file that does not even contain the
* "objectid" variable.
****************************************************************************
* Arguments Legend
* coordinatesFile: file which stores the coordinates of each centroid per
* each spatial unit.
* longitudeVar: variable with the longitude coordinate
* latitudeVar: variable with the latitude coordinate
* distanceFile: The file which stores the point distance, centroid to
* centroid, between your spatial units.
* distanceUnit: meters, km, feet, miles, etc.
* distanceVar: variable name for the distance variable
* areaFile: The file stores the total area of your spatial unit. Could be
* the same file as the distanceFile, but ideally you want to use different
* projections (distance preserving, area preserving, etc.) to reduce the
* error in each variable.
* areaUnit: squared km, squared feet, etc.
* areaVar: variable name for the area variable
* neighbroingLinesFile: This file stores the data from the XXXXXX tool, from
* ArcGIS.
* rootDir: path to your library.
* geoid: variable that identifies the spatial unit in ArcGIS
* idVar: variable that identifies the spatial unit from hereafter
* maxNeighborDegree: how many neighboring degrees should be classified. By
* choosing 1 the program will classify only the first degree neighbors, by
* choosing 2 it will also classify the neighbor of the neighbor (2nd degree
* neighbor), by choosing 3 it will classify the neighbor of the neighbor of
* the neighbor (3rd degree neighbor), and so on.
* CAUTION: Choosing higher maximum degrees results in very long runtimes.
* This program is not very efficient and could take a lot of time (and
* require a lot of RAM) to process high degree of neighboring relationship.
tempfile coordinatesTable ///
distanceTable ///
areaTable ///
neighboringCounties
* Get coordinates for each county centroid
import delimited "`rootDir'/`PATH_GIS'/`coordinatesFile'", ///
clear stringcols(`coordinatesKEEP_STRINGS')
rename `geoid' `idVar'
rename objectid gisID
keep gisID `idVar' `longitudeVar' `latitudeVar'
rename `longitudeVar' longitude
rename `latitudeVar' latitude
sort gisID
save `coordinatesTable', replace
* Import the distance matrix between county centroids
* Use the stringcols option to preserve leading zeros in your id variable
import delimited "`rootDir'/`PATH_GIS'/`distanceFile'", ///
clear stringcols(`distanceKEEP_STRINGS')
rename `geoid' `idVar'Prime
rename `geoid'_1 `idVar'
* This would be the place to convert, if needed, the distance units.
label var `distanceVar' "Distance (in `distanceUnit')"
keep `idVar'Prime `idVar' `distanceVar'
sort `idVar'Prime `idVar'
save `distanceTable', replace
* Import the area of each spatial unit
import delimited "`rootDir'/`PATH_GIS'/`areaFile'", ///
clear stringcols(`areaKEEP_STRINGS')
rename `geoid' `idVar'
keep `idVar' `areaVar'
label var `areaVar' "Area (in `areaUnit')"
sort `idVar'
save `areaTable', replace
* Now that we have the distance between centroids, and the area of each
* spatial unit stored in memory we can continue to classify the neighboring
* relationship between the spatial units. We need to prepare one more file.
* Import the information about the neighboring lines, the lines that identify
* the spatial units that share a border.
import delimited "`rootDir'/`PATH_GIS'/`neighbroingLinesFile'", clear
keep objectid left_fid right_fid
* Drop the lines that are on the edge, meaning that there is no further spatial
* unit beyond that line.
drop if left_fid < 0 | right_fid < 0
* No point in having duplicate line sets that identify the same neighboring
* spatial unit so we keep only the unique combinations.
bysort left_fid right_fid: keep if _n == 1
rename left_fid gisID
rename right_fid neighborID
* We now have the the neighbors to the right of each spatial unit
sort gisID
save `neighboringCounties', replace
* Repeating to find the neighbors on the left of each spatial unit
import delimited "`rootDir'/`PATH_GIS'/`neighbroingLinesFile'", clear
keep objectid left_fid right_fid
drop if left_fid < 0 | right_fid < 0
bysort left_fid right_fid: keep if _n == 1
rename right_fid gisID
rename left_fid neighborID
* We now have the neighbors to the left of each spatial unit
sort gisID
* Combine the neighbors to the left and to the right of each spatial unit by
* appending the previous file saved.
append using `neighboringCounties'
keep gisID neighborID
save `neighboringCounties', replace
* We now have all the needed files ready and we can start the classification.
use `coordinatesTable', clear
sort `idVar'
merge 1:1 `idVar' using `areaTable'
keep if _merge == 3
drop _merge
rename `idVar' `idVar'Prime
rename longitude longitudePrime
rename latitude latitudePrime
rename `areaVar' areaPrime
* Each observation is a spatial unit that serves as an anchor, denoted as PRIME.
* Now we start identifying the neighboring units of each PRIME unit.
* Match spatial units based on their GIS objectid numbers
merge 1:m gisID using `neighboringCounties'
keep if _merge == 3
drop _merge
* This step just gave us the matching between each unique prime spatial unit
* and all its first degree neighbors. Next we want to add the data about the
* coordinates of the first degree neighbors and the distance between the prime
* units and the first degree neighbors.
* Flag the neighborID as the GIS objectid number (gisID)
drop gisID
rename neighborID gisID
sort gisID
* Match the gisID with the idVar and geolocation data
merge m:1 gisID using `coordinatesTable'
keep if _merge == 3
drop _merge
* Get the distance data
sort `idVar'Prime `idVar'
merge 1:1 `idVar'Prime `idVar' using `distanceTable'
keep if _merge == 3
drop _merge
* Get the area data
sort `idVar'
merge m:1 `idVar' using `areaTable'
keep if _merge == 3
drop _merge
* Rename
rename `idVar' `idVar'Neighbor1
rename longitude longitudeNeighbor1
rename latitude latitudeNeighbor1
rename `distanceVar' distanceNeighbor1
rename `areaVar' areaNeighbor1
* Next we are going to repeat the above process for second degree neighbors and
* so on, up to the neighboring degree determined by the maxNeighborDegree
* parameter.
* This will simply be used to order the variables nicely at the end.
local sortOrder = ""
forvalues degreeN = 2(1)`maxNeighborDegree' {
sort gisID
joinby gisID using `neighboringCounties'
drop gisID
rename neighborID gisID
sort gisID
merge m:1 gisID using `coordinatesTable'
keep if _merge == 3
drop _merge
sort `idVar'Prime `idVar'
merge m:1 `idVar'Prime `idVar' using `distanceTable'
drop if _merge == 2
drop _merge
sort `idVar'
merge m:1 `idVar' using `areaTable'
keep if _merge == 3
drop _merge
rename `idVar' `idVar'Neighbor`degreeN'
rename longitude longitudeNeighbor`degreeN'
rename latitude latitudeNeighbor`degreeN'
rename `distanceVar' distanceNeighbor`degreeN'
rename `areaVar' areaNeighbor`degreeN'
local sortOrder = "`sortOrder' " + "`idVar'Neighbor`degreeN'"
}
sort `idVar'Prime `idVar'Neighbor1 `sortOrder'
* Removing duplicates.
* This process takes the most amount of time relative to the other pieces and
* this is also the part that does not scale well, meaning that computational
* time increases exponentially with the number of max neighbor degree.
* Counties should not be neighbors of themselves
forvalues degreeN = 2(1)`maxNeighborDegree' {
drop if `idVar'Prime == `idVar'Neighbor`degreeN'
}
* Neighboring counties should only be counted once
local maxNeighborDegreeMinus1 = `maxNeighborDegree' -1
qui levelsof `idVar'Prime, local(`idVar'Levels)
* The following two locals will help generate a progress bar
local idN: word count "``idVar'Levels'"
local k = 1
foreach idPrime in ``idVar'Levels' {
forvalues degreeN = 1(1)`maxNeighborDegreeMinus1' {
qui levelsof `idVar'Neighbor`degreeN' if `idVar'Prime == "`idPrime'", local(`idVar'N`degreeN')
local degreeN_Plus_1 = `degreeN' + 1
foreach idCodeN`degreeN' in ``idVar'N`degreeN'' {
qui drop if `idVar'Neighbor`degreeN_Plus_1' == "`idCodeN`degreeN''" ///
& `idVar'Prime == "`idPrime'"
}
}
* I included a progress so there will be some meaningful output while this runs
local PROGRESS = "Progress (Neighbor): " + string(round(`k'/`idN'*100,0.01)) + "%"
di "`PROGRESS'"
local k = `k' + 1
}
drop gisID
* Each spatial unit, identified by `idVar'Prime, has its neighbors of degree 1,
* identified by `idVar'Neighbor1, and its neighbors of degree 2, identified by
* `idVar'Neighbor2; and so on.
*
* Notice that the number of observations per-spatial unit is determined by the
* number of unique spatial units in the the highest neighboring degree, meaning
* there are some duplicates left in the lower levels.
* This part makes it easier to merge the resulting file based on `idVar'
gen `idVar' = `idVar'Prime
sort `idVar'
save `neighboringCounties', replace
end