A UDF Example2 in pig

Problem statement: encrypt the sensitive fields in data files using a UDF (user-defined function) in Pig.


To run the example, go to $PIG_HOME/bin and execute the script in local mode:

./pig -x local /home/hadoop/Desktop/DeIdenUDF1.pig
___________________________________
Java code for the UDF. Compile it and export it to a jar file (DeIdenUDF1.jar, which the Pig script registers).

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.security.GeneralSecurityException;
import java.security.InvalidKeyException;
import java.security.NoSuchAlgorithmException;

import javax.crypto.BadPaddingException;
import javax.crypto.Cipher;
import javax.crypto.IllegalBlockSizeException;
import javax.crypto.NoSuchPaddingException;
import javax.crypto.spec.SecretKeySpec;

import org.apache.commons.codec.binary.Base64;
import org.apache.pig.EvalFunc;
import org.apache.pig.PigWarning;
import org.apache.pig.data.Tuple;

public class DeIdentifyUDF extends EvalFunc<String> {

    @Override
    public String exec(Tuple input) throws IOException {
         if (input == null || input.size() < 2) {
                warn("invalid number of arguments to DEIDENTIFY", PigWarning.UDF_WARNING_1);
                return null;
            }
            try {
                String plainText = (String)input.get(0);
                String encryptKey = (String)input.get(1);
                String str="";
                try {
                    str = encrypt(plainText,encryptKey.getBytes());
                }  catch (NoSuchPaddingException e) {
                    // TODO Auto-generated catch block
                    str="NoSuchPaddingException";
                    e.printStackTrace();
                } catch (IllegalBlockSizeException e) {
                    // TODO Auto-generated catch block
                    str="IllegalBlockSizeException";
                    e.printStackTrace();
                } catch (BadPaddingException e) {
                    // TODO Auto-generated catch block
                    str="BadPaddingException";
                    e.printStackTrace();
                }
                catch (InvalidKeyException e) {
                    // TODO Auto-generated catch block
                    str="InvalidKeyException";
                    e.printStackTrace();
                } catch (NoSuchAlgorithmException e) {
                    // TODO Auto-generated catch block
                    str="NoSuchAlgorithmException";
                    e.printStackTrace();
                }
                return str;
            } 
            catch (NullPointerException npe) {
                warn(npe.toString(), PigWarning.UDF_WARNING_2);
                return null;
            } catch (StringIndexOutOfBoundsException npe) {
                warn(npe.toString(), PigWarning.UDF_WARNING_3);
                return null;
            } catch (ClassCastException e) {
                warn(e.toString(), PigWarning.UDF_WARNING_4);
                return null;
            }
            
    }
     private String encrypt(String strToEncrypt, byte[] key) throws NoSuchAlgorithmException, NoSuchPaddingException, InvalidKeyException, IllegalBlockSizeException, BadPaddingException 
        {
                Cipher cipher = Cipher.getInstance("AES/ECB/PKCS5Padding");
                SecretKeySpec secretKey = new SecretKeySpec(key, "AES");
                cipher.init(Cipher.ENCRYPT_MODE, secretKey);
                String encryptedString = Base64.encodeBase64String(cipher.doFinal(strToEncrypt.getBytes()));
                System.out.println("------------encryptedString"+encryptedString);
                return encryptedString.trim();
        }
     
     
}
_________________________________________________________________________________
The Pig script file, saved on the Desktop as DeIdenUDF1.pig

-- Register the jar that contains the DeIdentifyUDF class.
REGISTER /home/hadoop/Desktop/DeIdenUDF1.jar;

-- Load both patient datasets with the same schema so they can be unioned.
A = LOAD '/home/hadoop/Desktop/healthcare_Sample_dataset2.csv' using PigStorage(',') AS (PatientID: int, Name: chararray, DOB: chararray, PhoneNumber: chararray, EmailAddress: chararray, SSN: chararray, Gender: chararray, Disease: chararray, weight: float);

-- B must be loaded: the UNION below references it.
B = LOAD '/home/hadoop/Desktop/healthcare_Sample_dataset1.csv' using PigStorage(',') AS (PatientID: int, Name: chararray, DOB: chararray, PhoneNumber: chararray, EmailAddress: chararray, SSN: chararray, Gender: chararray, Disease: chararray, weight: float);

C = UNION A, B;

-- Encrypt every sensitive field with the same key; PatientID and weight pass
-- through unchanged, and Gender is not included in the output.
-- If DeIdentifyUDF is compiled into a package (e.g. depig), call it by its
-- fully qualified name instead, e.g. depig.DeIdentifyUDF(Name,'12345678abcdefgh').
D = FOREACH C GENERATE PatientID, DeIdentifyUDF(Name,'12345678abcdefgh'), DeIdentifyUDF(DOB,'12345678abcdefgh'), DeIdentifyUDF(PhoneNumber,'12345678abcdefgh'), DeIdentifyUDF(EmailAddress,'12345678abcdefgh'),DeIdentifyUDF(SSN,'12345678abcdefgh'), DeIdentifyUDF(Disease,'12345678abcdefgh'),weight;

STORE D into '/home/hadoop/Desktop/deidentifiedDir';

----------------------------------------------------------------------------------------------------------------------------------------
Dataset one
______________
11111,aaa1,12/10/1950,1234567890,aaa1@xxx.com,1111111111,M,Diabetes,78
11112,aaa2,12/10/1984,1234567890,aaa2@xxx.com,1111111111,F,PCOS,67
11113,aaa3,712/11/1940,1234567890,aaa3@xxx.com,1111111111,M,Fever,90
11114,aaa4,12/12/1950,1234567890,aaa4@xxx.com,1111111111,F,Cold,88
11115,aaa5,12/13/1960,1234567890,aaa5@xxx.com,1111111111,M,Blood Pressure,76
11116,aaa6,12/14/1970,1234567890,aaa6@xxx.com,1111111111,F,Malaria,84
11117,aaa7,12/15/1980,1234567890,aaa7@xxx.com,1111111111,M,Swine Flu,64
11118,aaa8,12/16/1990,1234567890,aaa8@xxx.com,1111111111,F,Fever,33
11119,aaa9,12/17/2000,1234567890,aaa9@xxx.com,1111111111,F,Fever,29
11120,aaa1,12/10/1950,1234567890,aaa1@xxx.com,1111111111,M,Diabetes,78
11121,aaa2,12/10/1984,1234567890,aaa2@xxx.com,1111111111,F,PCOS,67
11122,aaa3,712/11/1940,1234567890,aaa3@xxx.com,1111111111,M,Fever,90
11123,aaa4,12/12/1950,1234567890,aaa4@xxx.com,1111111111,F,Cold,88
11124,aaa5,12/13/1960,1234567890,aaa5@xxx.com,1111111111,M,Blood Pressure,76
11125,aaa6,12/14/1970,1234567890,aaa6@xxx.com,1111111111,F,Malaria,84
11126,aaa7,12/15/1980,1234567890,aaa7@xxx.com,1111111111,M,Swine Flu,64
11127,aaa8,12/16/1990,1234567890,aaa8@xxx.com,1111111111,F,Fever,33
11128,aaa9,12/17/2000,1234567890,aaa9@xxx.com,1111111111,F,Fever,29
11129,aaa1,12/10/1950,1234567890,aaa1@xxx.com,1111111111,M,Diabetes,78
11130,aaa2,12/10/1984,1234567890,aaa2@xxx.com,1111111111,F,PCOS,67
11131,aaa3,712/11/1940,1234567890,aaa3@xxx.com,1111111111,M,Fever,90
11132,aaa4,12/12/1950,1234567890,aaa4@xxx.com,1111111111,F,Cold,88
11133,aaa5,12/13/1960,1234567890,aaa5@xxx.com,1111111111,M,Blood Pressure,76
11134,aaa6,12/14/1970,1234567890,aaa6@xxx.com,1111111111,F,Malaria,84
11135,aaa7,12/15/1980,1234567890,aaa7@xxx.com,1111111111,M,Swine Flu,64
11136,aaa8,12/16/1990,1234567890,aaa8@xxx.com,1111111111,F,Fever,33
11137,aaa9,12/17/2000,1234567890,aaa9@xxx.com,1111111111,F,Fever,29
11138,aaa1,12/10/1950,1234567890,aaa1@xxx.com,1111111111,M,Diabetes,78
11139,aaa2,12/10/1984,1234567890,aaa2@xxx.com,1111111111,F,PCOS,67
11140,aaa3,712/11/1940,1234567890,aaa3@xxx.com,1111111111,M,Fever,90
11141,aaa4,12/12/1950,1234567890,aaa4@xxx.com,1111111111,F,Cold,88
11142,aaa5,12/13/1960,1234567890,aaa5@xxx.com,1111111111,M,Blood Pressure,76
11143,aaa6,12/14/1970,1234567890,aaa6@xxx.com,1111111111,F,Malaria,84
11144,aaa7,12/15/1980,1234567890,aaa7@xxx.com,1111111111,M,Swine Flu,64
11145,aaa8,12/16/1990,1234567890,aaa8@xxx.com,1111111111,F,Fever,33
11146,aaa9,12/17/2000,1234567890,aaa9@xxx.com,1111111111,F,Fever,29
___________
Dataset two

11111,bbb1,12-10-1950,1234567890,bbb1@xxx.com,1111111111,M,Diabetes,78
11112,bbb2,12-10-1984,1234567890,bbb2@xxx.com,1111111111,F,PCOS,67
11113,bbb3,712/11/1940,1234567890,bbb3@xxx.com,1111111111,M,Fever,90
11114,bbb4,12-12-1950,1234567890,bbb4@xxx.com,1111111111,F,Cold,88
11115,bbb5,12/13/1960,1234567890,bbb5@xxx.com,1111111111,M,Blood Pressure,76
11116,bbb6,12/14/1970,1234567890,bbb6@xxx.com,1111111111,F,Malaria,84
11117,bbb7,12/15/1980,1234567890,bbb7@xxx.com,1111111111,M,Swine Flu,64
11118,bbb8,12/16/1990,1234567890,bbb8@xxx.com,1111111111,F,Fever,33
11119,bbb9,12/17/2000,1234567890,bbb9@xxx.com,1111111111,F,Fever,29
11120,bbb1,12-10-1950,1234567890,bbb1@xxx.com,1111111111,M,Diabetes,78
11121,bbb2,12-10-1984,1234567890,bbb2@xxx.com,1111111111,F,PCOS,67
11122,bbb3,712/11/1940,1234567890,bbb3@xxx.com,1111111111,M,Fever,90
11123,bbb4,12-12-1950,1234567890,bbb4@xxx.com,1111111111,F,Cold,88
11124,bbb5,12/13/1960,1234567890,bbb5@xxx.com,1111111111,M,Blood Pressure,76
11125,bbb6,12/14/1970,1234567890,bbb6@xxx.com,1111111111,F,Malaria,84
11126,bbb7,12/15/1980,1234567890,bbb7@xxx.com,1111111111,M,Swine Flu,64
11127,bbb8,12/16/1990,1234567890,bbb8@xxx.com,1111111111,F,Fever,33
11128,bbb9,12/17/2000,1234567890,bbb9@xxx.com,1111111111,F,Fever,29
11129,bbb1,12-10-1950,1234567890,bbb1@xxx.com,1111111111,M,Diabetes,78
11130,bbb2,12-10-1984,1234567890,bbb2@xxx.com,1111111111,F,PCOS,67
11131,bbb3,712/11/1940,1234567890,bbb3@xxx.com,1111111111,M,Fever,90
11132,bbb4,12-12-1950,1234567890,bbb4@xxx.com,1111111111,F,Cold,88
11133,bbb5,12/13/1960,1234567890,bbb5@xxx.com,1111111111,M,Blood Pressure,76
11134,bbb6,12/14/1970,1234567890,bbb6@xxx.com,1111111111,F,Malaria,84
11135,bbb7,12/15/1980,1234567890,bbb7@xxx.com,1111111111,M,Swine Flu,64
11136,bbb8,12/16/1990,1234567890,bbb8@xxx.com,1111111111,F,Fever,33
11137,bbb9,12/17/2000,1234567890,bbb9@xxx.com,1111111111,F,Fever,29
11138,bbb1,12-10-1950,1234567890,bbb1@xxx.com,1111111111,M,Diabetes,78
11139,bbb2,12-10-1984,1234567890,bbb2@xxx.com,1111111111,F,PCOS,67
11140,bbb3,712/11/1940,1234567890,bbb3@xxx.com,1111111111,M,Fever,90
11141,bbb4,12-12-1950,1234567890,bbb4@xxx.com,1111111111,F,Cold,88
11142,bbb5,12/13/1960,1234567890,bbb5@xxx.com,1111111111,M,Blood Pressure,76
11143,bbb6,12/14/1970,1234567890,bbb6@xxx.com,1111111111,F,Malaria,84
11144,bbb7,12/15/1980,1234567890,bbb7@xxx.com,1111111111,M,Swine Flu,64
11145,bbb8,12/16/1990,1234567890,bbb8@xxx.com,1111111111,F,Fever,33
11146,bbb9,12/17/2000,1234567890,bbb9@xxx.com,1111111111,F,Fever,29

No comments:

Post a Comment