Problem statement: encrypt the sensitive fields in data files using a UDF in Pig (Example 2).
To run it, go to $PIG_HOME/bin and execute:
./pig -x local /home/hadoop/Desktop/DeIdenUDF1.pig
___________________________________
Java code for the UDF. Export it to a jar file (DeIdenUDF1.jar, which the Pig script below registers).
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.security.GeneralSecurityException;
import java.security.InvalidKeyException;
import java.security.NoSuchAlgorithmException;
import javax.crypto.BadPaddingException;
import javax.crypto.Cipher;
import javax.crypto.IllegalBlockSizeException;
import javax.crypto.NoSuchPaddingException;
import javax.crypto.spec.SecretKeySpec;
import org.apache.commons.codec.binary.Base64;
import org.apache.pig.EvalFunc;
import org.apache.pig.PigWarning;
import org.apache.pig.data.Tuple;
public class DeIdentifyUDF extends EvalFunc<String> {
/**
 * De-identifies a single field value by AES-encrypting it with the supplied key.
 *
 * <p>Field 0 of {@code input} is the plain text and field 1 is the encryption key;
 * both must be strings. Every failure is reported through a Pig warning and yields
 * {@code null}, so a bad record never writes a placeholder such as an exception
 * name into the de-identified output.
 *
 * @param input tuple holding the plain text (field 0) and the key (field 1)
 * @return the Base64-encoded cipher text, or {@code null} when the tuple has fewer
 *         than two fields, a field is null or not a string, or encryption fails
 * @throws IOException if a field cannot be read from the tuple
 */
@Override
public String exec(Tuple input) throws IOException {
    if (input == null || input.size() < 2) {
        warn("invalid number of arguments to DEIDENTIFY", PigWarning.UDF_WARNING_1);
        return null;
    }

    Object rawText = input.get(0);
    Object rawKey = input.get(1);

    // Validate explicitly instead of relying on NullPointerException /
    // ClassCastException being thrown and caught further down.
    if (rawText == null || rawKey == null) {
        warn("null argument to DEIDENTIFY", PigWarning.UDF_WARNING_2);
        return null;
    }
    if (!(rawText instanceof String) || !(rawKey instanceof String)) {
        warn("non-string argument to DEIDENTIFY", PigWarning.UDF_WARNING_4);
        return null;
    }

    try {
        // A fixed charset keeps the key bytes identical on every node,
        // whatever the platform default encoding is.
        byte[] keyBytes = ((String) rawKey).getBytes(StandardCharsets.UTF_8);
        return encrypt((String) rawText, keyBytes);
    } catch (GeneralSecurityException e) {
        // Covers every checked exception declared by encrypt(): bad key length,
        // unavailable algorithm/padding, and block-size/padding failures.
        warn("DEIDENTIFY encryption failed: " + e, PigWarning.UDF_WARNING_5);
        return null;
    }
}
/**
 * Encrypts {@code strToEncrypt} with AES and returns the cipher text as Base64.
 *
 * @param strToEncrypt plain text to encrypt; encoded as UTF-8 before encryption
 * @param key raw AES key bytes; AES accepts 16-, 24- or 32-byte keys
 * @return the Base64-encoded cipher text with surrounding whitespace removed
 * @throws NoSuchAlgorithmException if no provider supplies the AES transformation
 * @throws NoSuchPaddingException if PKCS5 padding is unavailable
 * @throws InvalidKeyException if {@code key} is not a valid AES key
 * @throws IllegalBlockSizeException if the cipher rejects the input length
 * @throws BadPaddingException declared by {@code Cipher.doFinal}
 */
private String encrypt(String strToEncrypt, byte[] key) throws NoSuchAlgorithmException, NoSuchPaddingException, InvalidKeyException, IllegalBlockSizeException, BadPaddingException
{
    // NOTE(review): ECB is deterministic (equal plain text gives equal cipher text),
    // which leaks value equality. The mode is left unchanged because switching it
    // would alter every value already produced by this UDF - confirm whether
    // deterministic output is a requirement before changing it.
    Cipher cipher = Cipher.getInstance("AES/ECB/PKCS5Padding");
    cipher.init(Cipher.ENCRYPT_MODE, new SecretKeySpec(key, "AES"));

    // Explicit charset: the platform default may differ between cluster nodes,
    // which would make the same input encrypt to different values.
    byte[] cipherText = cipher.doFinal(strToEncrypt.getBytes(StandardCharsets.UTF_8));

    // trim() is kept from the original; presumably it guards against a trailing
    // line break appended by some commons-codec versions - TODO confirm.
    return Base64.encodeBase64String(cipherText).trim();
}
}_________________________________________________________________________________
The Pig script file, saved on the Desktop as DeIdenUDF1.pig:
-- Make the UDF jar available to the script.
REGISTER /home/hadoop/Desktop/DeIdenUDF1.jar;
-- Load both sample data sets with the same schema so they can be combined.
A = LOAD '/home/hadoop/Desktop/healthcare_Sample_dataset2.csv' using PigStorage(',') AS (PatientID: int, Name: chararray, DOB: chararray, PhoneNumber: chararray, EmailAddress: chararray, SSN: chararray, Gender: chararray, Disease: chararray, weight: float);
-- B must be loaded: the UNION below references it and fails on an undefined alias otherwise.
B = LOAD '/home/hadoop/Desktop/healthcare_Sample_dataset1.csv' using PigStorage(',') AS (PatientID: int, Name: chararray, DOB: chararray, PhoneNumber: chararray, EmailAddress: chararray, SSN: chararray, Gender: chararray, Disease: chararray, weight: float);
C = UNION A, B;
-- Variant that calls the UDF through a package prefix (depig), kept for reference.
/*D = FOREACH C GENERATE PatientID, depig.DeIdentifyUDF(Name,'12345678abcdefgh'), depig.DeIdentifyUDF(DOB,'12345678abcdefgh'), depig.DeIdentifyUDF(PhoneNumber,'12345678abcdefgh'),
depig.DeIdentifyUDF(EmailAddress,'12345678abcdefgh'),depig.DeIdentifyUDF(SSN,'12345678abcdefgh'), depig.DeIdentifyUDF(Disease,'12345678abcdefgh'),weight;*/
-- Encrypt every sensitive column with the 16-character key; PatientID and weight pass through unchanged.
D = FOREACH C GENERATE PatientID, DeIdentifyUDF(Name,'12345678abcdefgh'), DeIdentifyUDF(DOB,'12345678abcdefgh'), DeIdentifyUDF(PhoneNumber,'12345678abcdefgh'), DeIdentifyUDF(EmailAddress,'12345678abcdefgh'),DeIdentifyUDF(SSN,'12345678abcdefgh'), DeIdentifyUDF(Disease,'12345678abcdefgh'),weight;
STORE D into '/home/hadoop/Desktop/deidentifiedDir';
----------------------------------------------------------------------------------------------------------------------------------------
Dataset one
______________
11111,aaa1,12/10/1950,1234567890,aaa1@xxx.com,1111111111,M,Diabetes,78
11112,aaa2,12/10/1984,1234567890,aaa2@xxx.com,1111111111,F,PCOS,67
11113,aaa3,712/11/1940,1234567890,aaa3@xxx.com,1111111111,M,Fever,90
11114,aaa4,12/12/1950,1234567890,aaa4@xxx.com,1111111111,F,Cold,88
11115,aaa5,12/13/1960,1234567890,aaa5@xxx.com,1111111111,M,Blood Pressure,76
11116,aaa6,12/14/1970,1234567890,aaa6@xxx.com,1111111111,F,Malaria,84
11117,aaa7,12/15/1980,1234567890,aaa7@xxx.com,1111111111,M,Swine Flu,64
11118,aaa8,12/16/1990,1234567890,aaa8@xxx.com,1111111111,F,Fever,33
11119,aaa9,12/17/2000,1234567890,aaa9@xxx.com,1111111111,F,Fever,29
11120,aaa1,12/10/1950,1234567890,aaa1@xxx.com,1111111111,M,Diabetes,78
11121,aaa2,12/10/1984,1234567890,aaa2@xxx.com,1111111111,F,PCOS,67
11122,aaa3,712/11/1940,1234567890,aaa3@xxx.com,1111111111,M,Fever,90
11123,aaa4,12/12/1950,1234567890,aaa4@xxx.com,1111111111,F,Cold,88
11124,aaa5,12/13/1960,1234567890,aaa5@xxx.com,1111111111,M,Blood Pressure,76
11125,aaa6,12/14/1970,1234567890,aaa6@xxx.com,1111111111,F,Malaria,84
11126,aaa7,12/15/1980,1234567890,aaa7@xxx.com,1111111111,M,Swine Flu,64
11127,aaa8,12/16/1990,1234567890,aaa8@xxx.com,1111111111,F,Fever,33
11128,aaa9,12/17/2000,1234567890,aaa9@xxx.com,1111111111,F,Fever,29
11129,aaa1,12/10/1950,1234567890,aaa1@xxx.com,1111111111,M,Diabetes,78
11130,aaa2,12/10/1984,1234567890,aaa2@xxx.com,1111111111,F,PCOS,67
11131,aaa3,712/11/1940,1234567890,aaa3@xxx.com,1111111111,M,Fever,90
11132,aaa4,12/12/1950,1234567890,aaa4@xxx.com,1111111111,F,Cold,88
11133,aaa5,12/13/1960,1234567890,aaa5@xxx.com,1111111111,M,Blood Pressure,76
11134,aaa6,12/14/1970,1234567890,aaa6@xxx.com,1111111111,F,Malaria,84
11135,aaa7,12/15/1980,1234567890,aaa7@xxx.com,1111111111,M,Swine Flu,64
11136,aaa8,12/16/1990,1234567890,aaa8@xxx.com,1111111111,F,Fever,33
11137,aaa9,12/17/2000,1234567890,aaa9@xxx.com,1111111111,F,Fever,29
11138,aaa1,12/10/1950,1234567890,aaa1@xxx.com,1111111111,M,Diabetes,78
11139,aaa2,12/10/1984,1234567890,aaa2@xxx.com,1111111111,F,PCOS,67
11140,aaa3,712/11/1940,1234567890,aaa3@xxx.com,1111111111,M,Fever,90
11141,aaa4,12/12/1950,1234567890,aaa4@xxx.com,1111111111,F,Cold,88
11142,aaa5,12/13/1960,1234567890,aaa5@xxx.com,1111111111,M,Blood Pressure,76
11143,aaa6,12/14/1970,1234567890,aaa6@xxx.com,1111111111,F,Malaria,84
11144,aaa7,12/15/1980,1234567890,aaa7@xxx.com,1111111111,M,Swine Flu,64
11145,aaa8,12/16/1990,1234567890,aaa8@xxx.com,1111111111,F,Fever,33
11146,aaa9,12/17/2000,1234567890,aaa9@xxx.com,1111111111,F,Fever,29
___________
Dataset two
11111,bbb1,12-10-1950,1234567890,bbb1@xxx.com,1111111111,M,Diabetes,78
11112,bbb2,12-10-1984,1234567890,bbb2@xxx.com,1111111111,F,PCOS,67
11113,bbb3,712/11/1940,1234567890,bbb3@xxx.com,1111111111,M,Fever,90
11114,bbb4,12-12-1950,1234567890,bbb4@xxx.com,1111111111,F,Cold,88
11115,bbb5,12/13/1960,1234567890,bbb5@xxx.com,1111111111,M,Blood Pressure,76
11116,bbb6,12/14/1970,1234567890,bbb6@xxx.com,1111111111,F,Malaria,84
11117,bbb7,12/15/1980,1234567890,bbb7@xxx.com,1111111111,M,Swine Flu,64
11118,bbb8,12/16/1990,1234567890,bbb8@xxx.com,1111111111,F,Fever,33
11119,bbb9,12/17/2000,1234567890,bbb9@xxx.com,1111111111,F,Fever,29
11120,bbb1,12-10-1950,1234567890,bbb1@xxx.com,1111111111,M,Diabetes,78
11121,bbb2,12-10-1984,1234567890,bbb2@xxx.com,1111111111,F,PCOS,67
11122,bbb3,712/11/1940,1234567890,bbb3@xxx.com,1111111111,M,Fever,90
11123,bbb4,12-12-1950,1234567890,bbb4@xxx.com,1111111111,F,Cold,88
11124,bbb5,12/13/1960,1234567890,bbb5@xxx.com,1111111111,M,Blood Pressure,76
11125,bbb6,12/14/1970,1234567890,bbb6@xxx.com,1111111111,F,Malaria,84
11126,bbb7,12/15/1980,1234567890,bbb7@xxx.com,1111111111,M,Swine Flu,64
11127,bbb8,12/16/1990,1234567890,bbb8@xxx.com,1111111111,F,Fever,33
11128,bbb9,12/17/2000,1234567890,bbb9@xxx.com,1111111111,F,Fever,29
11129,bbb1,12-10-1950,1234567890,bbb1@xxx.com,1111111111,M,Diabetes,78
11130,bbb2,12-10-1984,1234567890,bbb2@xxx.com,1111111111,F,PCOS,67
11131,bbb3,712/11/1940,1234567890,bbb3@xxx.com,1111111111,M,Fever,90
11132,bbb4,12-12-1950,1234567890,bbb4@xxx.com,1111111111,F,Cold,88
11133,bbb5,12/13/1960,1234567890,bbb5@xxx.com,1111111111,M,Blood Pressure,76
11134,bbb6,12/14/1970,1234567890,bbb6@xxx.com,1111111111,F,Malaria,84
11135,bbb7,12/15/1980,1234567890,bbb7@xxx.com,1111111111,M,Swine Flu,64
11136,bbb8,12/16/1990,1234567890,bbb8@xxx.com,1111111111,F,Fever,33
11137,bbb9,12/17/2000,1234567890,bbb9@xxx.com,1111111111,F,Fever,29
11138,bbb1,12-10-1950,1234567890,bbb1@xxx.com,1111111111,M,Diabetes,78
11139,bbb2,12-10-1984,1234567890,bbb2@xxx.com,1111111111,F,PCOS,67
11140,bbb3,712/11/1940,1234567890,bbb3@xxx.com,1111111111,M,Fever,90
11141,bbb4,12-12-1950,1234567890,bbb4@xxx.com,1111111111,F,Cold,88
11142,bbb5,12/13/1960,1234567890,bbb5@xxx.com,1111111111,M,Blood Pressure,76
11143,bbb6,12/14/1970,1234567890,bbb6@xxx.com,1111111111,F,Malaria,84
11144,bbb7,12/15/1980,1234567890,bbb7@xxx.com,1111111111,M,Swine Flu,64
11145,bbb8,12/16/1990,1234567890,bbb8@xxx.com,1111111111,F,Fever,33
11146,bbb9,12/17/2000,1234567890,bbb9@xxx.com,1111111111,F,Fever,29
No comments:
Post a Comment