Creating a Table in HBase from a Java Program with Bulk Load (with Table Variables as Run Configurations)

Problem Statement: Create an HBase table whose specifications (table name, column families, column qualifiers, data file location) are supplied to the program as run configuration arguments, and load the entire data file into it.


__________________________________________________
Create a Java program in any IDE named HBaseCreateLoad.java (code below).


import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
//import java.util.ArrayList;
//import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
//import org.apache.hadoop.hbase.client.Delete;
//import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;


public class HBaseCreateLoad {

    private static Configuration conf = null;

    /**
     * Initialization: pass the HBase configuration to the program.
     */
    static {
        conf = HBaseConfiguration.create();
    }

    /**
     * Create a table
     */

    public static void createTable(String tableName, String[] familys)
            throws Exception {
        HBaseAdmin admin = new HBaseAdmin(conf);
        if (admin.tableExists(tableName)) {
            System.out.println("table already exists!");
        } else {
            @SuppressWarnings("deprecation")
            HTableDescriptor tableDesc = new HTableDescriptor(tableName);
            for (int i = 0; i < familys.length; i++) {
                tableDesc.addFamily(new HColumnDescriptor(familys[i]));
            }
            admin.createTable(tableDesc);
            System.out.println("create table " + tableName + " ok.");
        }
    }
  
    public static void deleteTable(String tableName) throws Exception {
        try {
            HBaseAdmin admin = new HBaseAdmin(conf);
            admin.disableTable(tableName);
            admin.deleteTable(tableName);
            System.out.println("delete table " + tableName + " ok.");
        } catch (MasterNotRunningException e) {
            e.printStackTrace();
        } catch (ZooKeeperConnectionException e) {
            e.printStackTrace();
        }
    }

   /**
    * Put (or insert) a row
    */

    public static void addRecords(String tableName, String[] cFamily, String fp, String[] cols) throws Exception {

        // fp is the path of the data file to load, tableName is the name of the table,
        // cFamily holds the column family names the table was created with,
        // and cols holds the column qualifier names that go with those families.
        try {
            // read the comma-separated file
            String csvfile = fp;
            BufferedReader br = new BufferedReader(new FileReader(csvfile));
            String line;
            int row = 0;
            // access the table in HBase
            HTable table = new HTable(conf, tableName);
            Put p;
                
            while ((line = br.readLine()) != null) {
                row++;
                String value[] = line.split(",");
                String rowid = Integer.toString(row);

                // As per the problem statement, the first value of each CSV line goes into the
                // first column family (e.g. "exName" with qualifier "exnm"), while the remaining
                // values go into the second column family (e.g. "stkNm") under the qualifiers
                // passed in the run configuration ("tkr" "tdate" "opr" "hpr" "lpr" "cpr" "svol" "pravg").
                p = new Put(Bytes.toBytes(rowid));
                p.add(Bytes.toBytes(cFamily[0]), Bytes.toBytes(cols[0]), Bytes.toBytes(value[0]));
                for (int i = 1; i < cols.length; i++) {
                    p.add(Bytes.toBytes(cFamily[1]), Bytes.toBytes(cols[i]), Bytes.toBytes(value[i]));
                }
                table.put(p);
            }
              
                /*HTable table = new HTable(conf, tableName);
                           Put put = new Put(Bytes.toBytes(rowKey));
                     put.add(Bytes.toBytes(family), Bytes.toBytes(qualifier), Bytes
                          .toBytes(value));
                    table.put(put);
                   System.out.println("insert recored " + rowKey + " to table "
                  + tableName + " ok."); */

                 } catch (IOException e) {
                        e.printStackTrace();
                  }
            }
  
          
    @SuppressWarnings("deprecation")
    public static void getAllRecord (String tableName) {
        try{
               HTable table = new HTable(conf, tableName);
                 Scan s = new Scan();
                ResultScanner ss = table.getScanner(s);
                for(Result r :ss){
                         for(KeyValue kv : r.raw()){
                                 System.out.print(new String(kv.getRow()) + " ");
                                 System.out.print(new String(kv.getFamily()) + ":");
                                System.out.print(new String(kv.getQualifier()) + " ");
                               System.out.print(kv.getTimestamp() + " ");
                              System.out.println(new String(kv.getValue()));
                          }
              }
        } catch (IOException e){
                 e.printStackTrace();
               }
    }  
   
    public static void main(String[] args) {
        // The table specifications are passed to main() as program arguments, e.g.:
        // "stocktable" 2 "exName" "stkNm" "/home/hadoop/Desktop/nyse" "exnm"
        // "tkr" "tdate" "opr" "hpr" "lpr" "cpr" "svol" "pravg"

        try {
            // the table name is in args[0]
            String tablename = args[0];
            // args[1] gives the number of column families; their names follow from args[2] onwards
            String[] familys = new String[Integer.parseInt(args[1])];
            for (int i = 0; i < Integer.parseInt(args[1]); i++) {
                familys[i] = args[2 + i];
            }
            // create the table from the table name and column family information
            HBaseCreateLoad.createTable(tablename, familys);

            // the CSV file path follows the column family names (position args[1] + 2, i.e. 4 in this example)
            int fpath = Integer.parseInt(args[1]) + 2;
            System.out.println("fpath variable position value " + fpath);

            // the column qualifier names start right after the file path
            int colNames = fpath + 1;
            System.out.println("column variables position start value " + colNames);
            int argsLen = args.length - 1;
            String[] colTemp = new String[argsLen - fpath];
            int cVar1 = 0;
            while (colNames <= argsLen) {
                colTemp[cVar1] = args[colNames];
                System.out.println("column variables value " + colTemp[cVar1]);
                cVar1++;
                colNames++;
            }
            // load the entire file
            System.out.println("fpath variable value " + args[fpath]);
            HBaseCreateLoad.addRecords(tablename, familys, args[fpath], colTemp);
             
               System.out.println("===========show all record========");
               HBaseCreateLoad.getAllRecord(tablename);
            } catch (Exception e) {
                    e.printStackTrace();
                    }
            }
   
    }
________________________________________________________________
Run HBaseCreateLoad.java with the run configuration (program arguments) shown below.

"stocktable"  2 "exName" "stkNm"  "/home/hadoop/Desktop/nyse" "exnm" "tkr" "tdate" "opr" "hpr" "lpr" "cpr" "svol" "pravg"
------
Note (the run configuration above was derived by analyzing the data file "nyse", shown further below):

1) The table name is given as "stocktable"
2) The data is organized into two column families (exName, stkNm)
3) The data file location is /home/hadoop/Desktop/nyse
4) The column qualifiers are given as "exnm" "tkr" "tdate" "opr" "hpr" "lpr" "cpr" "svol" "pravg"

_______________________________________________________
Once the program has executed, go to $HBASE_HOME/bin and open the HBase shell ($HBASE_HOME/bin>hbase shell).
Make sure HBase is running.
In the HBase shell:
hbase>list
hbase>scan 'stocktable'
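
If you prefer to verify from Java rather than the shell, a minimal sketch of a row-count helper that could be added to HBaseCreateLoad (it reuses the same conf field and deprecated client API as the code above; the method name countRecords is ours, not part of the original program):

    public static void countRecords(String tableName) throws IOException {
        // scan the whole table and count the rows returned
        HTable table = new HTable(conf, tableName);
        ResultScanner scanner = table.getScanner(new Scan());
        int count = 0;
        for (Result r : scanner) {
            count++;
        }
        scanner.close();
        table.close();
        System.out.println("rows in " + tableName + ": " + count);
    }

Called as HBaseCreateLoad.countRecords("stocktable"), it should report one row per line of the nyse file.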
___________________________________________________________________
The data file used:
file named nyse (on Desktop)

NYSE,QTM,08-02-2010,2.37,2.42,2.29,2.36,3013600,2.36
NYSE,QTM,05-02-2010,2.38,2.5,2.34,2.41,2687600,2.41
NYSE,QTM,04-02-2010,2.57,2.64,2.39,2.46,4529800,2.46
NYSE,QTM,03-02-2010,2.64,2.67,2.55,2.63,2688600,2.63
NYSE,QTM,02-02-2010,2.69,2.76,2.56,2.66,2959700,2.66
NYSE,QTM,01-02-2010,2.6,2.8,2.52,2.67,5050100,2.67
NYSE,QTM,29-01-2010,2.63,2.73,2.26,2.56,16484000,2.56
NYSE,QTM,28-01-2010,3.09,3.09,2.95,3.06,3986400,3.06
NYSE,QTM,27-01-2010,3.03,3.1,2.99,3.03,2431900,3.03
NYSE,QTM,26-01-2010,3.07,3.18,3,3.03,4027600,3.03
NYSE,QTM,25-01-2010,2.94,3.07,2.93,3.03,2285400,3.03
NYSE,QTM,22-01-2010,2.94,3.1,2.89,2.9,2986700,2.9
NYSE,QTM,21-01-2010,2.94,3.11,2.92,2.95,4547800,2.95
NYSE,QTM,20-01-2010,2.94,2.97,2.88,2.93,1883900,2.93
NYSE,QTM,19-01-2010,2.89,2.94,2.88,2.94,2089700,2.94
NYSE,QTM,15-01-2010,2.86,2.96,2.85,2.88,2908000,2.88
NYSE,QTM,14-01-2010,2.91,2.98,2.88,2.92,3425500,2.92
NYSE,QTM,13-01-2010,2.69,2.93,2.61,2.91,4795700,2.91
NYSE,QTM,12-01-2010,2.98,2.99,2.18,2.65,8417400,2.65
NYSE,QTM,11-01-2010,3.03,3.03,2.99,3,2082400,3
NYSE,QTM,08-01-2010,2.96,3.07,2.95,2.99,2863600,2.99
NYSE,QTM,07-01-2010,2.88,2.97,2.86,2.96,2580700,2.96
NYSE,QTM,06-01-2010,2.87,2.98,2.85,2.89,2425200,2.89
NYSE,QTM,05-01-2010,3.06,3.1,2.83,2.97,4879600,2.97
NYSE,QTM,04-01-2010,3.1,3.1,2.98,3,5793500,3
NYSE,QTM,31-12-2009,2.96,3,2.92,2.93,1422900,2.93
NYSE,QTM,30-12-2009,2.96,3.04,2.95,2.97,1447900,2.97
NYSE,QTM,29-12-2009,3.05,3.08,2.95,2.98,1763000,2.98
NYSE,QTM,28-12-2009,3.08,3.15,3.03,3.05,2873700,3.05
NYSE,QTM,24-12-2009,2.98,3.04,2.96,3.03,1442300,3.03
NYSE,QTM,23-12-2009,2.95,2.99,2.92,2.97,1823300,2.97
NYSE,QTM,22-12-2009,2.93,2.95,2.9,2.92,1926200,2.92
NYSE,QTM,21-12-2009,2.8,2.92,2.76,2.9,2660100,2.9
NYSE,QTM,18-12-2009,2.89,2.94,2.76,2.76,3187100,2.76
NYSE,QTM,17-12-2009,2.93,2.94,2.85,2.85,1850300,2.85
NYSE,QTM,16-12-2009,2.95,2.95,2.83,2.86,1530500,2.86
NYSE,QTM,15-12-2009,2.82,2.91,2.7,2.88,1574900,2.88
NYSE,QTM,14-12-2009,2.91,2.94,2.87,2.88,1673600,2.88
NYSE,QTM,11-12-2009,2.85,2.94,2.85,2.91,2095100,2.91
NYSE,QTM,10-12-2009,2.81,2.88,2.81,2.87,2141900,2.87
NYSE,QTM,09-12-2009,2.67,2.82,2.65,2.81,2275600,2.81
NYSE,QTM,08-12-2009,2.66,2.79,2.65,2.74,1872800,2.74
NYSE,QTM,07-12-2009,2.72,2.82,2.72,2.78,2544900,2.78
NYSE,QTM,04-12-2009,2.71,2.74,2.63,2.7,1939700,2.7
NYSE,QTM,03-12-2009,2.66,2.71,2.62,2.63,2376900,2.63
NYSE,QTM,02-12-2009,2.6,2.65,2.56,2.62,2133700,2.62
NYSE,QTM,01-12-2009,2.48,2.56,2.42,2.54,4108700,2.54
NYSE,QTM,30-11-2009,2.43,2.44,2.34,2.44,4220700,2.44
NYSE,QTM,27-11-2009,2.33,2.39,2.29,2.34,1254000,2.34
NYSE,QTM,25-11-2009,2.49,2.51,2.41,2.42,1540400,2.42
NYSE,QTM,24-11-2009,2.52,2.55,2.37,2.5,1861800,2.5
NYSE,QTM,23-11-2009,2.55,2.62,2.51,2.53,1701800,2.53
NYSE,QTM,20-11-2009,2.55,2.58,2.5,2.51,1137200,2.51
NYSE,QTM,19-11-2009,2.65,2.67,2.5,2.57,2395200,2.57
NYSE,QTM,18-11-2009,2.78,2.78,2.63,2.66,2107100,2.66
NYSE,QTM,17-11-2009,2.63,2.72,2.58,2.68,3477400,2.68
NYSE,QTM,16-11-2009,2.5,2.61,2.47,2.6,3787700,2.6
NYSE,QTM,13-11-2009,2.47,2.48,2.39,2.45,1478500,2.45
NYSE,QTM,12-11-2009,2.42,2.48,2.42,2.47,2417200,2.47
NYSE,QTM,11-11-2009,2.53,2.53,2.4,2.43,1678000,2.43
NYSE,QTM,10-11-2009,2.54,2.54,2.26,2.4,3190800,2.4
NYSE,QTM,09-11-2009,2.36,2.5,2.36,2.45,2284700,2.45
NYSE,QTM,06-11-2009,2.4,2.5,2.4,2.44,1877000,2.44
NYSE,QTM,05-11-2009,2.54,2.56,2.41,2.46,4835000,2.46
NYSE,QTM,04-11-2009,2.07,2.64,2.05,2.35,8634300,2.35
NYSE,QTM,03-11-2009,1.77,2.05,1.77,2.05,3125400,2.05
NYSE,QTM,02-11-2009,1.82,1.88,1.72,1.79,3163600,1.79
NYSE,QTM,30-10-2009,1.91,1.92,1.81,1.85,2931000,1.85
NYSE,QTM,29-10-2009,2.01,2.02,1.81,1.91,4891300,1.91
NYSE,QTM,28-10-2009,1.8,2.04,1.8,1.98,13317100,1.98
NYSE,QTM,27-10-2009,1.54,1.73,1.53,1.63,3051700,1.63
NYSE,QTM,26-10-2009,1.51,1.57,1.5,1.53,1754700,1.53
NYSE,QTM,23-10-2009,1.54,1.55,1.47,1.51,2100100,1.51
NYSE,QTM,22-10-2009,1.47,1.55,1.46,1.53,2165800,1.53
NYSE,QTM,21-10-2009,1.36,1.6,1.34,1.47,4227400,1.47
NYSE,QTM,20-10-2009,1.45,1.45,1.36,1.36,1008700,1.36
NYSE,QTM,19-10-2009,1.44,1.46,1.42,1.43,675500,1.43
NYSE,QTM,16-10-2009,1.46,1.46,1.41,1.43,927500,1.43
NYSE,QTM,15-10-2009,1.45,1.49,1.44,1.47,1340900,1.47
NYSE,QTM,14-10-2009,1.51,1.52,1.45,1.47,2875900,1.47
NYSE,QTM,13-10-2009,1.44,1.47,1.42,1.46,1077300,1.46
NYSE,QTM,12-10-2009,1.52,1.54,1.42,1.45,1730800,1.45
NYSE,QTM,09-10-2009,1.39,1.5,1.39,1.47,2522100,1.47
NYSE,QTM,08-10-2009,1.31,1.47,1.3,1.38,3701200,1.38
NYSE,QTM,07-10-2009,1.27,1.31,1.27,1.3,738900,1.3
NYSE,QTM,06-10-2009,1.25,1.32,1.24,1.27,2059300,1.27
NYSE,QTM,05-10-2009,1.22,1.25,1.21,1.24,1233500,1.24
NYSE,QTM,02-10-2009,1.21,1.29,1.15,1.22,1220200,1.22
NYSE,QTM,01-10-2009,1.25,1.27,1.18,1.23,1299400,1.23
NYSE,QTM,30-09-2009,1.25,1.3,1.23,1.26,1039800,1.26
NYSE,QTM,29-09-2009,1.25,1.3,1.24,1.25,1016900,1.25
NYSE,QTM,28-09-2009,1.19,1.28,1.18,1.27,1229900,1.27
NYSE,QTM,25-09-2009,1.16,1.19,1.14,1.18,728800,1.18
NYSE,QTM,24-09-2009,1.21,1.24,1.14,1.16,1057300,1.16
NYSE,QTM,23-09-2009,1.25,1.28,1.21,1.21,903700,1.21
NYSE,QTM,22-09-2009,1.25,1.27,1.22,1.24,670900,1.24
NYSE,QTM,21-09-2009,1.26,1.27,1.22,1.23,999900,1.23
NYSE,QTM,18-09-2009,1.28,1.29,1.24,1.27,1466500,1.27
NYSE,QTM,17-09-2009,1.25,1.29,1.22,1.28,1140400,1.28
NYSE,QTM,16-09-2009,1.21,1.28,1.2,1.25,1321300,1.25
NYSE,QTM,15-09-2009,1.17,1.24,1.15,1.2,2024700,1.2
NYSE,QTM,14-09-2009,1.16,1.18,1.14,1.17,555600,1.17
NYSE,QTM,11-09-2009,1.2,1.22,1.15,1.17,600400,1.17
NYSE,QTM,10-09-2009,1.17,1.23,1.16,1.2,1014900,1.2
NYSE,QTM,09-09-2009,1.15,1.19,1.13,1.17,607000,1.17
NYSE,QTM,08-09-2009,1.17,1.18,1.11,1.14,1166900,1.14
NYSE,QTM,04-09-2009,1.13,1.18,1.12,1.17,640400,1.17
NYSE,QTM,03-09-2009,1.14,1.15,1.12,1.14,500700,1.14
NYSE,QTM,02-09-2009,1.15,1.2,1.1,1.14,746000,1.14
NYSE,QTM,01-09-2009,1.19,1.23,1.13,1.16,1123200,1.16
NYSE,QTM,31-08-2009,1.17,1.23,1.15,1.23,2709900,1.23
NYSE,QTM,28-08-2009,1.23,1.24,1.17,1.2,876600,1.2
NYSE,QTM,27-08-2009,1.15,1.22,1.12,1.22,1162500,1.22
NYSE,QTM,26-08-2009,1.16,1.19,1.13,1.16,759800,1.16
NYSE,QTM,25-08-2009,1.19,1.21,1.15,1.16,1218200,1.16
NYSE,QTM,24-08-2009,1.2,1.2,1.16,1.2,1811900,1.2
NYSE,QTM,21-08-2009,1.15,1.18,1.11,1.17,1939200,1.17
NYSE,QTM,20-08-2009,1.05,1.15,1.04,1.12,1940000,1.12
NYSE,QTM,19-08-2009,1.01,1.05,1,1.05,688400,1.05
NYSE,QTM,18-08-2009,1,1.04,1,1.02,551200,1.02
NYSE,QTM,17-08-2009,1.01,1.02,0.97,0.99,1208700,0.99
NYSE,QTM,14-08-2009,1.06,1.07,1.02,1.05,1189300,1.05
NYSE,QTM,13-08-2009,1,1.1,0.99,1.08,2659000,1.08
NYSE,QTM,12-08-2009,0.97,0.99,0.95,0.99,1771900,0.99
NYSE,QTM,11-08-2009,0.97,0.98,0.94,0.96,716200,0.96
NYSE,QTM,10-08-2009,0.91,0.97,0.91,0.97,856900,0.97
NYSE,QTM,07-08-2009,0.96,0.97,0.85,0.92,1311000,0.92
NYSE,QTM,06-08-2009,0.96,0.96,0.91,0.95,596200,0.95
NYSE,QTM,05-08-2009,0.91,0.95,0.89,0.95,1220200,0.95
NYSE,QTM,04-08-2009,0.9,0.94,0.85,0.9,1503300,0.9
NYSE,QTM,03-08-2009,0.93,0.95,0.89,0.91,1809200,0.91
NYSE,QTM,31-07-2009,0.93,0.97,0.92,0.92,850000,0.92
NYSE,QTM,30-07-2009,0.93,0.98,0.93,0.94,758500,0.94
NYSE,QTM,29-07-2009,1,1,0.92,0.92,1766000,0.92
NYSE,QTM,28-07-2009,0.99,1.04,0.95,1.01,1441300,1.01
NYSE,QTM,27-07-2009,0.97,0.99,0.95,0.96,778300,0.96
NYSE,QTM,24-07-2009,0.95,1,0.93,0.97,443700,0.97
NYSE,QTM,23-07-2009,1.02,1.04,0.94,0.98,1808400,0.98
NYSE,QTM,22-07-2009,0.95,0.99,0.93,0.99,636900,0.99
NYSE,QTM,21-07-2009,1.01,1.01,0.96,0.97,416700,0.97
NYSE,QTM,20-07-2009,1.04,1.05,1,1.01,838500,1.01
NYSE,QTM,17-07-2009,0.95,1.05,0.95,1.03,923800,1.03
NYSE,QTM,16-07-2009,1.04,1.05,0.98,1.02,869800,1.02
NYSE,QTM,15-07-2009,0.99,1.05,0.97,1.05,1845900,1.05
NYSE,QTM,14-07-2009,0.92,0.99,0.91,0.99,978200,0.99
NYSE,QTM,13-07-2009,0.84,0.92,0.83,0.92,821200,0.92
NYSE,QTM,10-07-2009,0.82,0.83,0.79,0.83,404000,0.83
NYSE,QTM,09-07-2009,0.88,0.88,0.82,0.82,665500,0.82
NYSE,QTM,08-07-2009,0.88,0.89,0.79,0.87,1068800,0.87
NYSE,QTM,07-07-2009,0.89,0.9,0.86,0.88,494000,0.88
NYSE,QTM,06-07-2009,0.92,0.92,0.85,0.89,695300,0.89
NYSE,QTM,02-07-2009,0.94,0.96,0.9,0.91,846300,0.91
NYSE,QTM,01-07-2009,0.89,0.99,0.89,0.98,2543300,0.98
NYSE,QTM,30-06-2009,0.85,0.89,0.83,0.83,683700,0.83
NYSE,QTM,29-06-2009,0.85,0.88,0.81,0.85,613400,0.85
NYSE,QTM,26-06-2009,0.88,0.88,0.79,0.79,2731700,0.79
NYSE,QTM,25-06-2009,0.81,0.88,0.8,0.88,437600,0.88
NYSE,QTM,24-06-2009,0.82,0.82,0.78,0.81,581500,0.81
NYSE,QTM,23-06-2009,0.8,0.82,0.76,0.8,528000,0.8
NYSE,QTM,22-06-2009,0.85,0.85,0.8,0.8,702400,0.8
NYSE,QTM,19-06-2009,0.83,0.88,0.82,0.85,1002500,0.85
NYSE,QTM,18-06-2009,0.87,0.89,0.8,0.83,1062200,0.83
NYSE,QTM,17-06-2009,0.92,0.92,0.86,0.86,698300,0.86
NYSE,QTM,16-06-2009,0.95,0.95,0.91,0.92,687100,0.92
NYSE,QTM,15-06-2009,0.95,0.96,0.92,0.94,1067700,0.94
NYSE,QTM,12-06-2009,0.92,0.96,0.91,0.95,881300,0.95
NYSE,QTM,11-06-2009,0.9,0.94,0.9,0.92,1183900,0.92
NYSE,QTM,10-06-2009,0.93,0.95,0.87,0.89,1526100,0.89
NYSE,QTM,09-06-2009,0.9,0.95,0.87,0.9,2633000,0.9
NYSE,QTM,08-06-2009,0.98,0.98,0.85,0.86,2143400,0.86
NYSE,QTM,05-06-2009,0.97,0.99,0.95,0.96,1026000,0.96


A Hive table can be accessed in HBase (cross reference), i.e. using HBase as the data store for a Hive table.

---------------------------------
The data file (in HDFS, named Customer.csv):

John,Wade,A,416-776-9281,2938,Bigcity|12345
Franklin,Josephs,I,905-560-9887,2913,Mediumcity|67890
Elizabeth,Reynolds,A,647-908-8865,2891,Smallcity|98765
Arthur,Gibbons,A,416-238-7765,400,Bigcity|12345
Lisa,Scott,I,416-824-8866,402,Bigcity|12345
Cliff,Bowden,A,905-994-9982,3772,Mediumcity|67890
Jean,Dexter,A,416-774-2339,210,Bigcity|12345
Roger,Vickers,A,647-232-9987,234,Smallcity|98765
Hubert,Lexter,A,416-972-2348,109,Bigcity|12345
Jack,Merdec,A,905-216-0989,54,Mediumcity|67890
Jovan,Melcic,A,905-324-4456,92,Mediumcity|67890
Melanie ,Ilyenko,I,416-988-7723,404,Bigcity|12345
Jillian,Panelo,A,905-498-8872,12,Mediumcity|67890
Michael,Junielle,A,416-209-9987,43,Bigcity|12345
Rodson,Mello,A,416-996-7654,220,Bigcity|12345
Erick,Jansen,A,905-498-8211,93,Mediumcity|67890
Grant,Getnet,A,416-298-3445,332,Bigcity|12345
Roscoe,Banhent,A,647-829-0223,338,Smallcity|98765
Allen,Serghert,A,416-344-0992,324,Bigcity|12345
John,Merdec,I,416-922-2331,55,Bigcity|12345
Franklin,Melcic,A,905-561-2330,647,Mediumcity|67890
Elizabeth,Ilyenko,A,416-354-2778,102,Bigcity|12345
Arthur,Panelo,A,905-276-9987,227,Mediumcity|67890
Lisa,Junielle,A,416-352-9992,323,Bigcity|12345
Cliff,Mello,A,416-210-9997,47,Bigcity|12345
Jean,Jansen,A,905-244-4588,431,Mediumcity|67890
Roger,Getnet,I,416-309-9982,24,Bigcity|12345
Hubert,Banhent,A,416-526-8888,19,Bigcity|12345
Gerhart,Serghert,A,905-561-9234,773,Mediumcity|67890
Lambert,Givens,A,416-209-8223,587,Bigcity|12345
Panelo,Wade,I,905-298-9992,922,Mediumcity|67890
Junielle,Josephs,A,905-737-9088,433,Mediumcity|67890
Mello,Reynolds,A,905-245-4431,64,Mediumcity|67890
Jansen,Gibbons,A,416-298-3881,895,Bigcity|12345
Getnet,Scott,A,905-309-8221,1993,Mediumcity|67890
Banhent,Bowden,A,416-989-0223,720,Bigcity|12345
Serghert,Dexter,A,416-823-0991,830,Bigcity|12345
Merdec,Vickers,I,416-298-0908,176,Bigcity|12345
Melcic,Lexter,A,416-823-4443,128,Bigcity|12345
Ilyenko,Merdec,A,416-293-8771,97,Bigcity|12345
Roscoe,Morris,A,905-455-3221,322,Mediumcity|67890
Allen,Jaskobec,A,416-299-0202,7,Bigcity|12345
John,Jarkin,A,416-622-0991,11,Bigcity|12345
Franklin,Drill,A,647-309-2331,37,Smallcity|98765
Elizabeth,Metzer,A,416-322-9001,85,Bigcity|12345
Arthur,Balgne,A,905-311-1211,331,Mediumcity|67890
Lisa,Hetzer,A,416-980-3229,482,Bigcity|12345
Cliff,Brandson,I,416-559-0223,3221,Bigcity|12345
Jean,Kulinski,A,905-409-8823,452,Mediumcity|67890
Roger,Marjory,A,647-290-6776,557,Smallcity|98765
Hubert,Mentz,A,416-577-8233,312,Bigcity|12345
___________________________________________________________________________
Go to the Hive shell and create a table:
hive>create table hivetohb_customers(first_name string, last_name string , category string,phone_number string , customer_id int ,address Array<String>)
    > STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'                                                                             
    > WITH SERDEPROPERTIES("hbase.columns.mapping"="nm:fn,nm:ln,pd:cat,pd:phno,:key,pd:addr")                                                  
    > TBLPROPERTIES("hbase.table.name"="hv_cust");
-----------------------------------------------------
Once the above command executes successfully in Hive, a table named hv_cust is created in HBase. (Go to the HBase shell and run hbase>list; look for the table hv_cust.)
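
You can also check for the table programmatically; a small sketch using the same HBaseAdmin API as the Java program earlier (CheckHvCust is a hypothetical class name; the table name hv_cust comes from the TBLPROPERTIES above):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.client.HBaseAdmin;

    public class CheckHvCust {
        public static void main(String[] args) throws Exception {
            Configuration conf = HBaseConfiguration.create();
            HBaseAdmin admin = new HBaseAdmin(conf);
            // should print true once the Hive DDL above has created the HBase-backed table
            System.out.println("hv_cust exists: " + admin.tableExists("hv_cust"));
            admin.close();
        }
    }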
_________________________________________________________________________
In the Hive shell, create a temporary staging table:
1)
hive>create table customers_temp(first_name string, last_name string , category string,phone_number string , customer_id int ,address Array<String>)
    > row format delimited
    > fields terminated by ','
    > collection items terminated by '|'
    > stored as textfile;

2)
hive>load data inpath '/user/hadoop/Customer.csv' overwrite into table customers_temp;  

3)hive> from customers_temp insert overwrite table hivetohb_customers select *
    > ;
4)
If you try to load the data directly into the table hivetohb_customers you will get an error:
hive> load data inpath '/user/hadoop/Customer.csv' overwrite into table hivetohb_customers;
FAILED: SemanticException [Error 10101]: A non-native table cannot be used as target for LOAD

5) hive> select * from hivetohb_customers; (to see the results)

_________________________________________________________________
Go to $HBASE_HOME/bin>hbase shell
hbase>scan 'hv_cust'
(This shows in HBase the same data that was inserted through Hive.)


Loading a tab-separated file into HBase from HDFS, and accessing the HBase table from a Hive table (cross reference; a hands-on explanation).

--------------------------------------------
Load the tab-separated file into HDFS:
cli>hdfs dfs -put /home/hadoop/Desktop/Customer.csv
______
Go to $HBASE_HOME/bin

The table hb_cust1 should exist in HBase before executing the ImportTsv command below, so create it first.

HBASE_HOME/bin>hbase shell

hbase>create 'hb_cust1','nm','pd'
hbase>list
hbase>describe 'hb_cust1'
hbase>exit
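
(If you prefer, the same table can be created from Java with the API used in HBaseCreateLoad above; a sketch, with CreateHbCust1 as a hypothetical class name:)

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.HColumnDescriptor;
    import org.apache.hadoop.hbase.HTableDescriptor;
    import org.apache.hadoop.hbase.client.HBaseAdmin;

    public class CreateHbCust1 {
        public static void main(String[] args) throws Exception {
            Configuration conf = HBaseConfiguration.create();
            HBaseAdmin admin = new HBaseAdmin(conf);
            if (!admin.tableExists("hb_cust1")) {
                HTableDescriptor desc = new HTableDescriptor("hb_cust1");
                desc.addFamily(new HColumnDescriptor("nm"));   // name columns
                desc.addFamily(new HColumnDescriptor("pd"));   // personal-data columns
                admin.createTable(desc);
            }
            admin.close();
        }
    }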

Then execute the following command from HBASE_HOME/bin:

./hbase org.apache.hadoop.hbase.mapreduce.ImportTsv -Dimporttsv.columns=nm:fname,nm:lname,pd:cat,pd:ph,HBASE_ROW_KEY,pd:addr 'hb_cust1' hdfs://localhost:8020/user/hadoop/pigout/part-m-00000
-----------------------------------------------------------------------------------
the data file :

John    Wade    A    416-776-9281    2938    Bigcity|12345
Franklin    Josephs    I    905-560-9887    2913    Mediumcity|67890
Elizabeth    Reynolds    A    647-908-8865    2891    Smallcity|98765
Arthur    Gibbons    A    416-238-7765    400    Bigcity|12345
Lisa    Scott    I    416-824-8866    402    Bigcity|12345
Cliff    Bowden    A    905-994-9982    3772    Mediumcity|67890
Jean    Dexter    A    416-774-2339    210    Bigcity|12345
Roger    Vickers    A    647-232-9987    234    Smallcity|98765
Hubert    Lexter    A    416-972-2348    109    Bigcity|12345
Jack    Merdec    A    905-216-0989    54    Mediumcity|67890
Jovan    Melcic    A    905-324-4456    92    Mediumcity|67890
Melanie     Ilyenko    I    416-988-7723    404    Bigcity|12345
Jillian    Panelo    A    905-498-8872    12    Mediumcity|67890
Michael    Junielle    A    416-209-9987    43    Bigcity|12345
Rodson    Mello    A    416-996-7654    220    Bigcity|12345
Erick    Jansen    A    905-498-8211    93    Mediumcity|67890
Grant    Getnet    A    416-298-3445    332    Bigcity|12345
Roscoe    Banhent    A    647-829-0223    338    Smallcity|98765
Allen    Serghert    A    416-344-0992    324    Bigcity|12345
John    Merdec    I    416-922-2331    55    Bigcity|12345
Franklin    Melcic    A    905-561-2330    647    Mediumcity|67890
Elizabeth    Ilyenko    A    416-354-2778    102    Bigcity|12345
Arthur    Panelo    A    905-276-9987    227    Mediumcity|67890
Lisa    Junielle    A    416-352-9992    323    Bigcity|12345
Cliff    Mello    A    416-210-9997    47    Bigcity|12345
Jean    Jansen    A    905-244-4588    431    Mediumcity|67890
Roger    Getnet    I    416-309-9982    24    Bigcity|12345
Hubert    Banhent    A    416-526-8888    19    Bigcity|12345
Gerhart    Serghert    A    905-561-9234    773    Mediumcity|67890
Lambert    Givens    A    416-209-8223    587    Bigcity|12345
Panelo    Wade    I    905-298-9992    922    Mediumcity|67890
Junielle    Josephs    A    905-737-9088    433    Mediumcity|67890
Mello    Reynolds    A    905-245-4431    64    Mediumcity|67890
Jansen    Gibbons    A    416-298-3881    895    Bigcity|12345
Getnet    Scott    A    905-309-8221    1993    Mediumcity|67890
Banhent    Bowden    A    416-989-0223    720    Bigcity|12345
Serghert    Dexter    A    416-823-0991    830    Bigcity|12345
Merdec    Vickers    I    416-298-0908    176    Bigcity|12345
Melcic    Lexter    A    416-823-4443    128    Bigcity|12345
Ilyenko    Merdec    A    416-293-8771    97    Bigcity|12345
Roscoe    Morris    A    905-455-3221    322    Mediumcity|67890
Allen    Jaskobec    A    416-299-0202    7    Bigcity|12345
John    Jarkin    A    416-622-0991    11    Bigcity|12345
Franklin    Drill    A    647-309-2331    37    Smallcity|98765
Elizabeth    Metzer    A    416-322-9001    85    Bigcity|12345
Arthur    Balgne    A    905-311-1211    331    Mediumcity|67890
Lisa    Hetzer    A    416-980-3229    482    Bigcity|12345
Cliff    Brandson    I    416-559-0223    3221    Bigcity|12345
Jean    Kulinski    A    905-409-8823    452    Mediumcity|67890
Roger    Marjory    A    647-290-6776    557    Smallcity|98765
Hubert    Mentz    A    416-577-8233    312    Bigcity|12345
---------------------------------------------------------------------------
Once it has executed, go to the HBase shell:
hbase>list
hbase>scan 'hb_cust1'
(to see the rows)
--------------------------------------------------------------------------
Because the data is already present in an HBase table, you can access it in Hive only through an external table declaration.

Follow these steps:
1) Go to $HIVE_HOME/bin
2) $HIVE_HOME/bin>hive (opens the Hive shell)
3) At the Hive shell:
hive>create external table hbtohive_customers(first_name string, last_name string , category string,phone_number string , customer_id int ,address Array<String>)
    > STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'                                                                             
    > WITH SERDEPROPERTIES("hbase.columns.mapping"="nm:fname,nm:lname,pd:cat,pd:ph,:key,pd:addr")                                                  
    > TBLPROPERTIES("hbase.table.name"="hb_cust1");
________________________________________________
In Hive, select * from hbtohive_customers; will display the results.

A Hive Thrift server example (hands-on lab).

-----------------------------------------
Create a Java file in Eclipse named HiveJdbcClient.java (the class below).
For the necessary jar files, go to $HIVE_HOME/lib, search for the required libraries (for example slf4j-api-1.6.1.jar) and add them to the build path of the program.
Also go to $HADOOP_HOME, search for the necessary Hadoop libraries and add them too.
________________________
HiveJdbcClient.java (add all the jars found in the build configuration step to the project)
--------------------------
import java.sql.SQLException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.sql.DriverManager;
 
public class HiveJdbcClient {
  private static String driverName = "org.apache.hadoop.hive.jdbc.HiveDriver";
 
  public static void main(String[] args) throws SQLException {
    try {
      Class.forName(driverName);
     System.out.println("try block");
    } catch (ClassNotFoundException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
      System.exit(1);
    }
    Connection con = DriverManager.getConnection("jdbc:hive://localhost:10000/default", "", "");
    Statement stmt = con.createStatement();
    String tableName = "test";
    stmt.executeQuery("drop table " + tableName);
    ResultSet res = stmt.executeQuery("create table " + tableName + " (key int, value string)");
    // show tables
    String sql = "show tables '" + tableName + "'";
    System.out.println("Running: " + sql);
    res = stmt.executeQuery(sql);
    if (res.next()) {
      System.out.println(res.getString(1));
    }
    // describe table
    sql = "describe " + tableName;
    System.out.println("Running: " + sql);
    res = stmt.executeQuery(sql);
    while (res.next()) {
      System.out.println(res.getString(1) + "\t" + res.getString(2));
    }
     
  }
}

----------------------------------
Go to $HIVE_HOME/bin directory

To start the Thrift server, type the command below:
$HIVE_HOME/bin>hive --service hiveserver
-------------
Go to Eclipse:
After adding all the jars, run the Java application and you will get the output.
________
For every query hard-coded in the Java code, one OK line is displayed on the output screen.
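
As a further sketch, the same client could load a local file into the table and query it back over the same connection; the file path and its layout (one key<TAB>value pair per line) are assumptions here, not part of the original example:

    // continuing from the Statement stmt above
    String filepath = "/tmp/test.txt";   // hypothetical local file with: key \t value per line
    sql = "load data local inpath '" + filepath + "' into table " + tableName;
    System.out.println("Running: " + sql);
    res = stmt.executeQuery(sql);

    sql = "select * from " + tableName;
    System.out.println("Running: " + sql);
    res = stmt.executeQuery(sql);
    while (res.next()) {
      System.out.println(String.valueOf(res.getInt(1)) + "\t" + res.getString(2));
    }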

A Hive UDF example (hands-on example).


DateExpand.java
-----------
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class DateExpand extends UDF{

    public Text evaluate(Text t){
        String tempDate1= null;
            if (t == null || t.toString().length() <= 0) {
                       //warn(null, null);
            return null;
            }
            try{
               
                String inStr =(String) t.toString();
                String[] tempDate =inStr.toString().split("-");
                int month =Integer.parseInt(tempDate[1].toString());
                String monthString;
                switch(month)
                {
                case 1:  monthString = "January";
                break;
                case 2:  monthString = "February";
                break;
                case 3:  monthString = "March";
                break;
                case 4:  monthString = "April";
                break;
                case 5:  monthString = "May";
                break;
                case 6:  monthString = "June";
                break;
                case 7:  monthString = "July";
                break;
                case 8:  monthString = "August";
                break;
                case 9:  monthString = "September";
                break;
                case 10: monthString = "October";
                break;
                case 11: monthString = "November";
                break;
                case 12: monthString = "December";
                break;
                default: monthString = "Invalid month";
                break;
                }
           
                int day1 =Integer.parseInt(tempDate[2].toString());
                String dayString =null;
                int tempWeek = 0;
                if (day1 >0 & day1<=7){tempWeek =1;}
                if (day1 >7 & day1<=14){tempWeek =2;}
                if (day1 >14 & day1<=21){tempWeek =3;}
                if (day1 >21){tempWeek =4;}
                switch(tempWeek)
                {
                case 1:  dayString = "First week of:"+ monthString.toString();
                break;
                case 2:  dayString = "Second week of :"+ monthString.toString();
                break;
                case 3:  dayString ="Third week of:  "+ monthString.toString();
                break;
                case 4:  dayString = "Fourth week of: "+ monthString.toString();
                break;
                default: dayString = "Invalid day specified";
                break;
                }
                    tempDate1 =t.toString()+":"+ dayString.toString()+" "+ tempDate[0].toString();       
                    return new Text(tempDate1);
                   
                }catch(Exception e)
                {
                    //e.printStackTrace();
                    return new Text(e.toString());
                   
                }
                   
            }
   
}
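
Before packaging the jar, the UDF can be sanity-checked with a plain Java main (a sketch; DateExpandTest is a hypothetical class name, and it only needs the Hive and Hadoop jars already on the build path):

    import org.apache.hadoop.io.Text;

    public class DateExpandTest {
        public static void main(String[] args) {
            DateExpand udf = new DateExpand();
            // for "2012-01-09" the output should read along the lines of
            // "2012-01-09:Second week of :January 2012"
            System.out.println(udf.evaluate(new Text("2012-01-09")));
            System.out.println(udf.evaluate(new Text("2012-01-24")));
        }
    }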
________
1) Export the jar file (with its libraries) as DateExpand.jar to a desired location (for example the Desktop).

2) Go to $HIVE_HOME/bin and execute
$HIVE_HOME/bin>hive (Hive shell)



3) In the Hive shell:
hive>show databases;
4) hive>use default;
5) hive>show tables; (check that the sales table is present)
_______
6) Create a DateExpand.q script:
gedit DateExpand.q (add the commands below and save it on the Desktop)

add jar /home/hadoop/Desktop/DateExpand.jar

use default;
 CREATE TEMPORARY FUNCTION dateexpand AS 'DateExpand';
insert overwrite directory  '/user/hadoop/hivesalesout' select cust_id,prod_no,qty,dateexpand(datestring),sales_id from sales;
______
7) Execute it:
$HIVE_HOME/bin>./hive -f /home/hadoop/Desktop/DateExpand.q

______
The sales data file:
587    87634    1    2012-01-09    34823
922    88734    1    2012-01-09    34824
433    99207    2    2012-01-09    34825
64    98243    1    2012-01-09    34826
922    77623    3    2012-01-09    34827
922    88734    24    2012-01-09    34828
331    282009    2    2012-01-09    34829
482    87634    1    2012-01-09    34830
3221    92387    15    2012-01-09    34831
452    282299    2    2012-01-09    34832
64    77624    17    2012-01-09    34833
895    88211    31    2012-01-09    34834
1993    92387    2    2012-01-09    34835
720    282009    2    2012-01-09    34836
830    282299    1    2012-01-09    34837
176    77623    1    2012-01-09    34838
128    88734    4    2012-01-09    34839
97    99202    1    2012-01-09    34840
322    99202    6    2012-01-09    34841
7    98243    1    2012-01-24    34842
11    77623    2    2012-01-24    34843
482    88734    1    2012-01-24    34844
3221    282009    1    2012-01-24    34845
452    99202    23    2012-01-24    34846
64    92387    4    2012-01-24    34847
895    282009    7    2012-01-24    34848
1993    92387    3    2012-01-24    34849
720    99207    1    2012-01-24    34850
102    98243    1    2012-01-24    34851
227    77623    1    2012-01-24    34852
323    99207    2    2012-01-24    34853
47    92387    1    2012-01-24    34854
431    87659    1    2012-01-24    34855
24    45641    5    2012-01-24    34856
19    88734    7    2012-01-24    34857
773    45641    4    2012-01-24    34858
647    45628    2    2012-01-24    34859
773    45628    10    2012-01-24    34860
_____________________________
Check the output in the HDFS directory /user/hadoop/hivesalesout.


An import from MySQL to HBase (hands-on exercise explanation).

 -----
Make sure your HBase is running.
Go to $HBASE_HOME/bin and execute the following commands:
$HBASE_HOME/bin>start-hbase.sh
$HBASE_HOME/bin>hbase shell

hbase> list_namespace_tables 'default'

(to see the list of tables)
___________

Go to $SQOOP_HOME/bin and run:

$SQOOP_HOME/bin>./sqoop import --connect jdbc:mysql://192.x.x.x/sakila --table actor --username <...> --password <...> --hbase-table actor --column-family pd --hbase-row-key actor_id --hbase-create-table -m 1
_________________
The above command creates a table in the HBase default namespace and loads the data from the MySQL table "actor".
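
To spot-check the imported rows without the HBase shell, a minimal Java sketch that scans the actor table (ScanActor is a hypothetical class name; the first_name qualifier assumes Sqoop kept the MySQL column names of sakila.actor under the pd family):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.client.HTable;
    import org.apache.hadoop.hbase.client.Result;
    import org.apache.hadoop.hbase.client.ResultScanner;
    import org.apache.hadoop.hbase.client.Scan;
    import org.apache.hadoop.hbase.util.Bytes;

    public class ScanActor {
        public static void main(String[] args) throws Exception {
            Configuration conf = HBaseConfiguration.create();
            HTable table = new HTable(conf, "actor");
            ResultScanner scanner = table.getScanner(new Scan());
            for (Result r : scanner) {
                String rowKey = Bytes.toString(r.getRow());
                String firstName = Bytes.toString(r.getValue(Bytes.toBytes("pd"), Bytes.toBytes("first_name")));
                System.out.println(rowKey + " -> " + firstName);
            }
            scanner.close();
            table.close();
        }
    }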

An export from Hive to MySQL using Sqoop (hands-on example explanation).

---------------------------------------
At the MySQL command line:
mysql>create table sales(custid int, prodno int, quantity int, datestring varchar(20),salesid int);

Check that the table has been created:
mysql>describe sales;
---------------------------
Go to $SQOOP_HOME/bin directory 

$SQOOP_HOME/bin>./sqoop export --connect jdbc:mysql://192.x.x.x/test --table sales --username biadmin --password biadmin --export-dir /user/hive/warehouse/customersales.db/sales/Sales.csv --input-fields-terminated-by , --input-lines-terminated-by '\n' -m 1



___
1) test is the database in MySQL; --table sales refers to the sales table in MySQL
2) --export-dir is the data file location in HDFS
3) --username biadmin --password biadmin are the username and password used to connect to the MySQL database
4) --input-fields-terminated-by , --input-lines-terminated-by '\n' : in the data file Sales.csv, fields are terminated by a comma and lines by \n
------------
Once the export has run, go to mysql:
mysql> select * from sales;
and check the rows.
------
Even when the MySQL table "sales" has fewer columns than "Sales.csv" in HDFS, the export still works: it loads only the columns you defined, from left to right, and drops the rest.
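
To verify the export programmatically rather than from the mysql client, a hedged JDBC sketch (CheckSalesExport is a hypothetical class name; it assumes MySQL Connector/J is on the classpath and reuses the test database, biadmin credentials and placeholder host from the Sqoop command above):

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.ResultSet;
    import java.sql.Statement;

    public class CheckSalesExport {
        public static void main(String[] args) throws Exception {
            Connection con = DriverManager.getConnection(
                    "jdbc:mysql://192.x.x.x/test", "biadmin", "biadmin");
            Statement stmt = con.createStatement();
            ResultSet rs = stmt.executeQuery("select count(*) from sales");
            if (rs.next()) {
                System.out.println("rows exported: " + rs.getInt(1));
            }
            con.close();
        }
    }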




A UDF example 2 in Pig

Problem statement: encrypt the sensitive fields in the data files using a Pig UDF.


Go to $PIG_HOME/bin and run:
$PIG_HOME/bin>./pig -x local /home/hadoop/Desktop/DeIdenUDF1.pig
___________________________________
The Java code for the UDF; export it to a jar file (DeIdenUDF1.jar).

import java.io.IOException;
import java.security.InvalidKeyException;
import java.security.NoSuchAlgorithmException;

import javax.crypto.BadPaddingException;
import javax.crypto.Cipher;
import javax.crypto.IllegalBlockSizeException;
import javax.crypto.NoSuchPaddingException;
import javax.crypto.spec.SecretKeySpec;

import org.apache.commons.codec.binary.Base64;
import org.apache.pig.EvalFunc;
import org.apache.pig.PigWarning;
import org.apache.pig.data.Tuple;

public class DeIdentifyUDF extends EvalFunc<String> {

    @Override
    public String exec(Tuple input) throws IOException {
         if (input == null || input.size() < 2) {
                warn("invalid number of arguments to DEIDENTIFY", PigWarning.UDF_WARNING_1);
                return null;
            }
            try {
                String plainText = (String)input.get(0);
                String encryptKey = (String)input.get(1);
                String str="";
                try {
                    str = encrypt(plainText,encryptKey.getBytes());
                }  catch (NoSuchPaddingException e) {
                    // TODO Auto-generated catch block
                    str="NoSuchPaddingException";
                    e.printStackTrace();
                } catch (IllegalBlockSizeException e) {
                    // TODO Auto-generated catch block
                    str="IllegalBlockSizeException";
                    e.printStackTrace();
                } catch (BadPaddingException e) {
                    // TODO Auto-generated catch block
                    str="BadPaddingException";
                    e.printStackTrace();
                }
                catch (InvalidKeyException e) {
                    // TODO Auto-generated catch block
                    str="InvalidKeyException";
                    e.printStackTrace();
                } catch (NoSuchAlgorithmException e) {
                    // TODO Auto-generated catch block
                    str="NoSuchAlgorithmException";
                    e.printStackTrace();
                }
                return str;
            } 
            catch (NullPointerException npe) {
                warn(npe.toString(), PigWarning.UDF_WARNING_2);
                return null;
            } catch (StringIndexOutOfBoundsException npe) {
                warn(npe.toString(), PigWarning.UDF_WARNING_3);
                return null;
            } catch (ClassCastException e) {
                warn(e.toString(), PigWarning.UDF_WARNING_4);
                return null;
            }
            
    }
     private String encrypt(String strToEncrypt, byte[] key) throws NoSuchAlgorithmException, NoSuchPaddingException, InvalidKeyException, IllegalBlockSizeException, BadPaddingException 
        {
                Cipher cipher = Cipher.getInstance("AES/ECB/PKCS5Padding");
                SecretKeySpec secretKey = new SecretKeySpec(key, "AES");
                cipher.init(Cipher.ENCRYPT_MODE, secretKey);
                String encryptedString = Base64.encodeBase64String(cipher.doFinal(strToEncrypt.getBytes()));
                System.out.println("------------encryptedString"+encryptedString);
                return encryptedString.trim();
        }
     
     
}
_________________________________________________________________________________
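The UDF can also be exercised outside Pig with a tiny harness (a sketch; DeIdentifyTest is a hypothetical class name, and the 16-character key matches the one used in the script below, since AES requires a 16/24/32-byte key):

    import org.apache.pig.data.Tuple;
    import org.apache.pig.data.TupleFactory;

    public class DeIdentifyTest {
        public static void main(String[] args) throws Exception {
            Tuple t = TupleFactory.getInstance().newTuple(2);
            t.set(0, "aaa1@xxx.com");          // a plain-text field from the data set
            t.set(1, "12345678abcdefgh");      // 16-byte AES key
            System.out.println(new DeIdentifyUDF().exec(t));   // prints the Base64 cipher text
        }
    }
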
The Pig script file on the Desktop, DeIdenUDF1.pig:

REGISTER /home/hadoop/Desktop/DeIdenUDF1.jar;

A = LOAD '/home/hadoop/Desktop/healthcare_Sample_dataset2.csv' using PigStorage(',') AS (PatientID: int, Name: chararray, DOB: chararray, PhoneNumber: chararray, EmailAddress: chararray, SSN: chararray, Gender: chararray, Disease: chararray, weight: float);

B = LOAD '/home/hadoop/Desktop/healthcare_Sample_dataset1.csv' using PigStorage(',') AS (PatientID: int, Name: chararray, DOB: chararray, PhoneNumber: chararray, EmailAddress: chararray, SSN: chararray, Gender: chararray, Disease: chararray, weight: float);

C = UNION A, B;

/*D = FOREACH C GENERATE PatientID, depig.DeIdentifyUDF(Name,'12345678abcdefgh'), depig.DeIdentifyUDF(DOB,'12345678abcdefgh'), depig.DeIdentifyUDF(PhoneNumber,'12345678abcdefgh'),
depig.DeIdentifyUDF(EmailAddress,'12345678abcdefgh'),depig.DeIdentifyUDF(SSN,'12345678abcdefgh'), depig.DeIdentifyUDF(Disease,'12345678abcdefgh'),weight;*/

D = FOREACH C GENERATE PatientID, DeIdentifyUDF(Name,'12345678abcdefgh'), DeIdentifyUDF(DOB,'12345678abcdefgh'), DeIdentifyUDF(PhoneNumber,'12345678abcdefgh'), DeIdentifyUDF(EmailAddress,'12345678abcdefgh'),DeIdentifyUDF(SSN,'12345678abcdefgh'), DeIdentifyUDF(Disease,'12345678abcdefgh'),weight;

STORE D into '/home/hadoop/Desktop/deidentifiedDir';

----------------------------------------------------------------------------------------------------------------------------------------
Data set one
______________
11111,aaa1,12/10/1950,1234567890,aaa1@xxx.com,1111111111,M,Diabetes,78
11112,aaa2,12/10/1984,1234567890,aaa2@xxx.com,1111111111,F,PCOS,67
11113,aaa3,712/11/1940,1234567890,aaa3@xxx.com,1111111111,M,Fever,90
11114,aaa4,12/12/1950,1234567890,aaa4@xxx.com,1111111111,F,Cold,88
11115,aaa5,12/13/1960,1234567890,aaa5@xxx.com,1111111111,M,Blood Pressure,76
11116,aaa6,12/14/1970,1234567890,aaa6@xxx.com,1111111111,F,Malaria,84
11117,aaa7,12/15/1980,1234567890,aaa7@xxx.com,1111111111,M,Swine Flu,64
11118,aaa8,12/16/1990,1234567890,aaa8@xxx.com,1111111111,F,Fever,33
11119,aaa9,12/17/2000,1234567890,aaa9@xxx.com,1111111111,F,Fever,29
11120,aaa1,12/10/1950,1234567890,aaa1@xxx.com,1111111111,M,Diabetes,78
11121,aaa2,12/10/1984,1234567890,aaa2@xxx.com,1111111111,F,PCOS,67
11122,aaa3,712/11/1940,1234567890,aaa3@xxx.com,1111111111,M,Fever,90
11123,aaa4,12/12/1950,1234567890,aaa4@xxx.com,1111111111,F,Cold,88
11124,aaa5,12/13/1960,1234567890,aaa5@xxx.com,1111111111,M,Blood Pressure,76
11125,aaa6,12/14/1970,1234567890,aaa6@xxx.com,1111111111,F,Malaria,84
11126,aaa7,12/15/1980,1234567890,aaa7@xxx.com,1111111111,M,Swine Flu,64
11127,aaa8,12/16/1990,1234567890,aaa8@xxx.com,1111111111,F,Fever,33
11128,aaa9,12/17/2000,1234567890,aaa9@xxx.com,1111111111,F,Fever,29
11129,aaa1,12/10/1950,1234567890,aaa1@xxx.com,1111111111,M,Diabetes,78
11130,aaa2,12/10/1984,1234567890,aaa2@xxx.com,1111111111,F,PCOS,67
11131,aaa3,712/11/1940,1234567890,aaa3@xxx.com,1111111111,M,Fever,90
11132,aaa4,12/12/1950,1234567890,aaa4@xxx.com,1111111111,F,Cold,88
11133,aaa5,12/13/1960,1234567890,aaa5@xxx.com,1111111111,M,Blood Pressure,76
11134,aaa6,12/14/1970,1234567890,aaa6@xxx.com,1111111111,F,Malaria,84
11135,aaa7,12/15/1980,1234567890,aaa7@xxx.com,1111111111,M,Swine Flu,64
11136,aaa8,12/16/1990,1234567890,aaa8@xxx.com,1111111111,F,Fever,33
11137,aaa9,12/17/2000,1234567890,aaa9@xxx.com,1111111111,F,Fever,29
11138,aaa1,12/10/1950,1234567890,aaa1@xxx.com,1111111111,M,Diabetes,78
11139,aaa2,12/10/1984,1234567890,aaa2@xxx.com,1111111111,F,PCOS,67
11140,aaa3,712/11/1940,1234567890,aaa3@xxx.com,1111111111,M,Fever,90
11141,aaa4,12/12/1950,1234567890,aaa4@xxx.com,1111111111,F,Cold,88
11142,aaa5,12/13/1960,1234567890,aaa5@xxx.com,1111111111,M,Blood Pressure,76
11143,aaa6,12/14/1970,1234567890,aaa6@xxx.com,1111111111,F,Malaria,84
11144,aaa7,12/15/1980,1234567890,aaa7@xxx.com,1111111111,M,Swine Flu,64
11145,aaa8,12/16/1990,1234567890,aaa8@xxx.com,1111111111,F,Fever,33
11146,aaa9,12/17/2000,1234567890,aaa9@xxx.com,1111111111,F,Fever,29
___________
A second data set

 11111,bbb1,12-10-1950,1234567890,bbb1@xxx.com,1111111111,M,Diabetes,78
11112,bbb2,12-10-1984,1234567890,bbb2@xxx.com,1111111111,F,PCOS,67
11113,bbb3,712/11/1940,1234567890,bbb3@xxx.com,1111111111,M,Fever,90
11114,bbb4,12-12-1950,1234567890,bbb4@xxx.com,1111111111,F,Cold,88
11115,bbb5,12/13/1960,1234567890,bbb5@xxx.com,1111111111,M,Blood Pressure,76
11116,bbb6,12/14/1970,1234567890,bbb6@xxx.com,1111111111,F,Malaria,84
11117,bbb7,12/15/1980,1234567890,bbb7@xxx.com,1111111111,M,Swine Flu,64
11118,bbb8,12/16/1990,1234567890,bbb8@xxx.com,1111111111,F,Fever,33
11119,bbb9,12/17/2000,1234567890,bbb9@xxx.com,1111111111,F,Fever,29
11120,bbb1,12-10-1950,1234567890,bbb1@xxx.com,1111111111,M,Diabetes,78
11121,bbb2,12-10-1984,1234567890,bbb2@xxx.com,1111111111,F,PCOS,67
11122,bbb3,712/11/1940,1234567890,bbb3@xxx.com,1111111111,M,Fever,90
11123,bbb4,12-12-1950,1234567890,bbb4@xxx.com,1111111111,F,Cold,88
11124,bbb5,12/13/1960,1234567890,bbb5@xxx.com,1111111111,M,Blood Pressure,76
11125,bbb6,12/14/1970,1234567890,bbb6@xxx.com,1111111111,F,Malaria,84
11126,bbb7,12/15/1980,1234567890,bbb7@xxx.com,1111111111,M,Swine Flu,64
11127,bbb8,12/16/1990,1234567890,bbb8@xxx.com,1111111111,F,Fever,33
11128,bbb9,12/17/2000,1234567890,bbb9@xxx.com,1111111111,F,Fever,29
11129,bbb1,12-10-1950,1234567890,bbb1@xxx.com,1111111111,M,Diabetes,78
11130,bbb2,12-10-1984,1234567890,bbb2@xxx.com,1111111111,F,PCOS,67
11131,bbb3,712/11/1940,1234567890,bbb3@xxx.com,1111111111,M,Fever,90
11132,bbb4,12-12-1950,1234567890,bbb4@xxx.com,1111111111,F,Cold,88
11133,bbb5,12/13/1960,1234567890,bbb5@xxx.com,1111111111,M,Blood Pressure,76
11134,bbb6,12/14/1970,1234567890,bbb6@xxx.com,1111111111,F,Malaria,84
11135,bbb7,12/15/1980,1234567890,bbb7@xxx.com,1111111111,M,Swine Flu,64
11136,bbb8,12/16/1990,1234567890,bbb8@xxx.com,1111111111,F,Fever,33
11137,bbb9,12/17/2000,1234567890,bbb9@xxx.com,1111111111,F,Fever,29
11138,bbb1,12-10-1950,1234567890,bbb1@xxx.com,1111111111,M,Diabetes,78
11139,bbb2,12-10-1984,1234567890,bbb2@xxx.com,1111111111,F,PCOS,67
11140,bbb3,712/11/1940,1234567890,bbb3@xxx.com,1111111111,M,Fever,90
11141,bbb4,12-12-1950,1234567890,bbb4@xxx.com,1111111111,F,Cold,88
11142,bbb5,12/13/1960,1234567890,bbb5@xxx.com,1111111111,M,Blood Pressure,76
11143,bbb6,12/14/1970,1234567890,bbb6@xxx.com,1111111111,F,Malaria,84
11144,bbb7,12/15/1980,1234567890,bbb7@xxx.com,1111111111,M,Swine Flu,64
11145,bbb8,12/16/1990,1234567890,bbb8@xxx.com,1111111111,F,Fever,33
11146,bbb9,12/17/2000,1234567890,bbb9@xxx.com,1111111111,F,Fever,29

A user-defined function in Pig

Problem statement: read the sales data in Sales.csv and display the date column in a readable format by writing a Pig UDF.


convdate.pig file on desktop. 
_____________________________
convdate.pig

REGISTER /home/hadoop/Desktop/ConvDate.jar;
salesdt = LOAD '/home/hadoop/Desktop/Sales.csv' using PigStorage(',') AS (cust_id:int,prod_num:int,quan_sold:int,dt1:chararray,sales_id:int);
dateexp = FOREACH salesdt GENERATE cust_id,prod_num,quan_sold,ConvDate(dt1),sales_id ;
--AS (cust_id1:int,prod_num1:int,quan_sold1:int,dateinfull:chararray,sales_id1:int);
STORE  dateexp into '/home/hadoop/Desktop/convdtout';

_______________________________________________
Create a Java project (and a package if you like) and create a class named ConvDate as shown below.
Add the necessary jar files that ConvDate.java uses to the project's build path.
-------
import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.PigWarning;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.WrappedIOException;


public class ConvDate extends EvalFunc<String>{

    @SuppressWarnings("deprecation")
    @Override
    public String exec(Tuple input) throws IOException {
       
        if (input == null || input.size() <= 0) {
            warn("invalid number of arguments to DEIDENTIFY", PigWarning.UDF_WARNING_1);
            //warn(null, null);
            return null;
        }
            try{
                String date= null;
                String inStr =(String) input.get(0);
                String[] tempDate =inStr.toString().split("-");
                int month =Integer.parseInt(tempDate[1].toString());
                String monthString;
                switch(month)
                {
                case 1:  monthString = "January";
                break;
                case 2:  monthString = "February";
                break;
                case 3:  monthString = "March";
                break;
                case 4:  monthString = "April";
                break;
                case 5:  monthString = "May";
                break;
                case 6:  monthString = "June";
                break;
                case 7:  monthString = "July";
                break;
                case 8:  monthString = "August";
                break;
                case 9:  monthString = "September";
                break;
                case 10: monthString = "October";
                break;
                case 11: monthString = "November";
                break;
                case 12: monthString = "December";
                break;
                default: monthString = "Invalid month";
                break;
                }
           
                int day =Integer.parseInt(tempDate[2].toString());
                String dayString =null;
                int week = 0;
                if (day >0 & day<=7){week =1;}
                if (day >7 & day<=14){week =2;}
                if (day >14 & day<=21){week =3;}
                if (day >21){week =4;}
                switch(week)
                {
                case 1:  dayString = "First week of:"+ monthString.toString();
                break;
                case 2:  dayString = "Second week of :"+ monthString.toString();
                break;
                case 3:  dayString ="Third week of:  "+ monthString.toString();
                break;
                case 4:  dayString = "Fourth week of: "+ monthString.toString();
                break;
                default: dayString = "Invalid day specified";
                break;
                }
                    date =input.toString()+":"+ dayString.toString()+ tempDate[0].toString()+":";       
                    return date;
            }catch(Exception e)
            {
                throw WrappedIOException.wrap(
                        "Caught exception processing input row ", e);
            }
        }

}

Export ConvDate.jar to the desired location (here the Desktop).
>jar tf ConvDate.jar (to check its contents)
-----------------------------------------------------------------------------------------------------------------------------

Go to the $PIG_HOME directory and change to the bin directory.

Execute this command (the script is on the Desktop, as noted above):
$PIG_HOME/bin>./pig -x local /home/hadoop/Desktop/convdate.pig
___________________________________________________________________
Sales.csv file
587,87634,1,2012-01-09,34823
922,88734,1,2012-01-09,34824
433,99207,2,2012-01-09,34825
64,98243,1,2012-01-09,34826
922,77623,3,2012-01-09,34827
922,88734,24,2012-01-09,34828
331,282009,2,2012-01-09,34829
482,87634,1,2012-01-09,34830
3221,92387,15,2012-01-09,34831
452,282299,2,2012-01-09,34832
64,77624,17,2012-01-09,34833
895,88211,31,2012-01-09,34834
1993,92387,2,2012-01-09,34835
720,282009,2,2012-01-09,34836
830,282299,1,2012-01-09,34837
176,77623,1,2012-01-09,34838
128,88734,4,2012-01-09,34839
97,99202,1,2012-01-09,34840
322,99202,6,2012-01-09,34841
7,98243,1,2012-01-24,34842
11,77623,2,2012-01-24,34843
482,88734,1,2012-01-24,34844
3221,282009,1,2012-01-24,34845
452,99202,23,2012-01-24,34846
64,92387,4,2012-01-24,34847
895,282009,7,2012-01-24,34848
1993,92387,3,2012-01-24,34849
720,99207,1,2012-01-24,34850
102,98243,1,2012-01-24,34851
227,77623,1,2012-01-24,34852
323,99207,2,2012-01-24,34853
47,92387,1,2012-01-24,34854
431,87659,1,2012-01-24,34855
24,45641,5,2012-01-24,34856
19,88734,7,2012-01-24,34857
773,45641,4,2012-01-24,34858
647,45628,2,2012-01-24,34859
773,45628,10,2012-01-24,34860
_________________________________________________
Pig output:
587    87634    1    (2012-01-09):Second week of :January2012:    34823
922    88734    1    (2012-01-09):Second week of :January2012:    34824
433    99207    2    (2012-01-09):Second week of :January2012:    34825
64    98243    1    (2012-01-09):Second week of :January2012:    34826
922    77623    3    (2012-01-09):Second week of :January2012:    34827
922    88734    24    (2012-01-09):Second week of :January2012:    34828
331    282009    2    (2012-01-09):Second week of :January2012:    34829
482    87634    1    (2012-01-09):Second week of :January2012:    34830
3221    92387    15    (2012-01-09):Second week of :January2012:    34831
452    282299    2    (2012-01-09):Second week of :January2012:    34832
64    77624    17    (2012-01-09):Second week of :January2012:    34833
895    88211    31    (2012-01-09):Second week of :January2012:    34834
1993    92387    2    (2012-01-09):Second week of :January2012:    34835
720    282009    2    (2012-01-09):Second week of :January2012:    34836
830    282299    1    (2012-01-09):Second week of :January2012:    34837
176    77623    1    (2012-01-09):Second week of :January2012:    34838
128    88734    4    (2012-01-09):Second week of :January2012:    34839
97    99202    1    (2012-01-09):Second week of :January2012:    34840
322    99202    6    (2012-01-09):Second week of :January2012:    34841
7    98243    1    (2012-01-24):Fourth week of: January2012:    34842
11    77623    2    (2012-01-24):Fourth week of: January2012:    34843
482    88734    1    (2012-01-24):Fourth week of: January2012:    34844
3221    282009    1    (2012-01-24):Fourth week of: January2012:    34845
452    99202    23    (2012-01-24):Fourth week of: January2012:    34846
64    92387    4    (2012-01-24):Fourth week of: January2012:    34847
895    282009    7    (2012-01-24):Fourth week of: January2012:    34848
1993    92387    3    (2012-01-24):Fourth week of: January2012:    34849
720    99207    1    (2012-01-24):Fourth week of: January2012:    34850
102    98243    1    (2012-01-24):Fourth week of: January2012:    34851
227    77623    1    (2012-01-24):Fourth week of: January2012:    34852
323    99207    2    (2012-01-24):Fourth week of: January2012:    34853
47    92387    1    (2012-01-24):Fourth week of: January2012:    34854
431    87659    1    (2012-01-24):Fourth week of: January2012:    34855
24    45641    5    (2012-01-24):Fourth week of: January2012:    34856
19    88734    7    (2012-01-24):Fourth week of: January2012:    34857
773    45641    4    (2012-01-24):Fourth week of: January2012:    34858
647    45628    2    (2012-01-24):Fourth week of: January2012:    34859
773    45628    10    (2012-01-24):Fourth week of: January2012:    34860

Reading an XML file into Pig.

----------------------
Put the following into a Pig script file, named for example:

xmread.pig

register /home/hadoop/Desktop/loader-forum.jar
pigdata = load '/home/hadoop/Desktop/pig/Pig_Practicals/xml1.xml' using XMLLoader('name') as (doc:chararray);

store pigdata into '/home/hadoop/Desktop/pigoutputvalues1';
values = foreach pigdata GENERATE FLATTEN(REGEX_EXTRACT_ALL(doc,'<name>(.*)</name>')) AS (name:chararray);

store values into '/home/hadoop/Desktop/pigoutputvalues';
------------------------------------------------------------------------------------------------------------------
--register '/usr/lib/pig-0.12.0/contrib/piggybank/java/piggybank.jar'

register /home/hadoop/Desktop/loader-forum.jar

pigdata = load '/home/hadoop/Desktop/Pig_Practicals/xml2.xml' USING XMLLoader('Property') as (doc:chararray);

store pigdata into '/home/hadoop/Desktop/pigoutputvalues1';

values = foreach pigdata GENERATE FLATTEN(REGEX_EXTRACT_ALL(doc,'<Property>\\s*<fname>(.*)</fname>\\s*<lname>(.*)</lname>\\s*<landmark>(.*)</landmark>\\s*<city>(.*)</city>\\s*<state>(.*)</state>\\s*<contact>(.*)</contact>\\s*<email>(.*)</email>\\s*<PAN_Card>(.*)</PAN_Card>\\s*<URL>(.*)</URL>\\s*</Property>')) AS (fname:chararray, lname:chararray, landmark:chararray, city:chararray, state:chararray, contact:int, email:chararray, PAN_Card:long, URL:chararray);

store values into '/home/hadoop/Desktop/pigoutputvalues';
__________________________
So the Pig script file is on the Desktop, named xmread.pig.
The xml1.xml and xml2.xml data files are under /home/hadoop/Desktop/pig/Pig_Practicals/ (adjust the load paths in the script if yours differ).
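
The REGEX_EXTRACT_ALL calls above are ordinary capture-group matching; if you want to try a pattern outside Pig first, a small Java sketch of the same idea for the simple <name> case (the sample fragment is an assumption, since xml1.xml itself is not shown here):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class XmlRegexCheck {
        public static void main(String[] args) {
            String doc = "<name>John Wade</name>";   // hypothetical record from xml1.xml
            Matcher m = Pattern.compile("<name>(.*)</name>").matcher(doc);
            if (m.find()) {
                // group(1) is the single field the first Pig script projects
                System.out.println(m.group(1));
            }
        }
    }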
____________________________
Go to the $PIG_HOME/bin directory and run:
$PIG_HOME/bin>./pig -x local /home/hadoop/Desktop/xmread.pig

______________________________________
The extracted data is written to /home/hadoop/Desktop/pigoutputvalues (and the raw records to /home/hadoop/Desktop/pigoutputvalues1).