python数据分析序列——DataFrame merge and apply

原创
2017/03/09 09:10
阅读数 110
import cx_Oracle
import pandas as pd 
import pandas.io.sql as psql
def fetch_all(sql,conn='user/pass@xxx.xx.x.xx:1521/x2'):
    db=cx_Oracle.connect(conn)
    df_ora=psql.read_sql(sql,con=db)
    db.close()
    return df_ora
sem_edu="user/pass@xxx.xx.x.xxx:1521/x1"
df_edu=fetch_all("select count(*) from ba.ba2010 ",sem_edu)

教育版人员数据总量

df_edu
这里输入代码
<div>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>COUNT(*)</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>20507</td>
    </tr>
  </tbody>
</table>
</div>
df_pro=fetch_all("select count(*) from ba.ba2010 ")

正式版人员数据总量

df_pro
<div>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>COUNT(*)</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>20394</td>
    </tr>
  </tbody>
</table>
</div>
df_edu=fetch_all("select emp_no emp_no1 from ba.ba2010 ",sem_edu)
df_pro=fetch_all("select emp_no emp_no2 from ba.ba2010 ")
df_edu.describe()
<div>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>EMP_NO1</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>count</th>
      <td>20507</td>
    </tr>
    <tr>
      <th>unique</th>
      <td>20507</td>
    </tr>
    <tr>
      <th>top</th>
      <td>21206168</td>
    </tr>
    <tr>
      <th>freq</th>
      <td>1</td>
    </tr>
  </tbody>
</table>
</div>
df_pro.describe()
<div>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>EMP_NO2</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>count</th>
      <td>20394</td>
    </tr>
    <tr>
      <th>unique</th>
      <td>20394</td>
    </tr>
    <tr>
      <th>top</th>
      <td>934471</td>
    </tr>
    <tr>
      <th>freq</th>
      <td>1</td>
    </tr>
  </tbody>
</table>
</div>

通过merge操作将两个数据集合进行合并

df_merge=df_edu.merge(df_pro,how='outer',left_on='EMP_NO1',right_on="EMP_NO2")

对空值进行填充处理

df_merge=df_merge.fillna('TUTUTU')

筛选出两个集合的补集

df_merge_filter=df_merge[(df_merge.EMP_NO1=='TUTUTU') | (df_merge.EMP_NO2=='TUTUTU')]
fetch_all("select * from ba.ba2011 t where t.emp_no = '96128754'")
<div>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>EMP_NO</th>
      <th>EMP_DESC</th>
      <th>ORGN_CD</th>
      <th>JO_NO</th>
      <th>JBRP</th>
      <th>JBL</th>
      <th>TRAD</th>
      <th>DIR_INDR_ID</th>
      <th>MH_APLY_PCNT</th>
      <th>MH_APLY_ST_DT</th>
      <th>MH_APLY_FN_DT</th>
      <th>RET_DT</th>
      <th>TEL_NO</th>
      <th>RGST_DT</th>
      <th>RGST_TIME</th>
      <th>RGST_USER_ID</th>
      <th>STUS_CD</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>96128754</td>
      <td>???</td>
      <td>P35400</td>
      <td></td>
      <td>60</td>
      <td>80</td>
      <td>41</td>
      <td>D</td>
      <td>100</td>
      <td></td>
      <td></td>
      <td></td>
      <td></td>
      <td>20170308</td>
      <td>142650</td>
      <td>HR</td>
      <td>0</td>
    </tr>
  </tbody>
</table>
</div>
fetch_all("select * from ba.ba2011 t where t.emp_no = 'A875001'",sem_edu).T
<div>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>0</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>EMP_NO</th>
      <td>A875001</td>
    </tr>
    <tr>
      <th>EMP_DESC</th>
      <td>???</td>
    </tr>
    <tr>
      <th>ORGN_CD</th>
      <td>P24307</td>
    </tr>
    <tr>
      <th>JO_NO</th>
      <td></td>
    </tr>
    <tr>
      <th>JBRP</th>
      <td>60</td>
    </tr>
    <tr>
      <th>JBL</th>
      <td>80</td>
    </tr>
    <tr>
      <th>TRAD</th>
      <td>53</td>
    </tr>
    <tr>
      <th>DIR_INDR_ID</th>
      <td>D</td>
    </tr>
    <tr>
      <th>MH_APLY_PCNT</th>
      <td>100</td>
    </tr>
    <tr>
      <th>MH_APLY_ST_DT</th>
      <td></td>
    </tr>
    <tr>
      <th>MH_APLY_FN_DT</th>
      <td></td>
    </tr>
    <tr>
      <th>RET_DT</th>
      <td></td>
    </tr>
    <tr>
      <th>TEL_NO</th>
      <td></td>
    </tr>
    <tr>
      <th>RGST_DT</th>
      <td>20170308</td>
    </tr>
    <tr>
      <th>RGST_TIME</th>
      <td>171936</td>
    </tr>
    <tr>
      <th>RGST_USER_ID</th>
      <td>IDM</td>
    </tr>
    <tr>
      <th>STUS_CD</th>
      <td>0</td>
    </tr>
  </tbody>
</table>
</div>
in_lizhidan="""988150
988156
988181
988158
988187
988001
988169
88893
85505
8Z0023
8Z0027
8Z0014
50223005
84571
84292
88531
85306
85520
8A0486
88840
922272
916800
922452
922604
922584
922220
922155
922601
919146
971222
971127
971310
971433
971385
971498
971446
971483
975326
975130
975347
939990
975229
975143
975028
88881
20001043
8A0874
21210014
J966334
J919047
81939
8A0455
919142
8A0041
8A0870
974332
953430
974711
974712
962077
978137
908439
908270
908205
978044
8A0563
88410
88444
89911
89943
89793
82976
8A0320
50207224
88253
953874
974572
974699
974412
974532
974702
953981
86980
88987
50223344
961789
961512
961400
961780
961717
961649
961735
961782
961550
961541""".split("\n")
df_merge_filter['IN_LIZHI']='N'
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
def has_lizhi(row):
    if row[0] in in_lizhidan:
        row[2]='Y'
    return row
df_lizhi=df_merge_filter.apply(has_lizhi,axis=1)
df_lizhi
<div>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>EMP_NO1</th>
      <th>EMP_NO2</th>
      <th>IN_LIZHI</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>10146</th>
      <td>94527750</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>17858</th>
      <td>94506706</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20395</th>
      <td>A920314</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20396</th>
      <td>85505</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20397</th>
      <td>86980</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20398</th>
      <td>84292</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20399</th>
      <td>85306</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20400</th>
      <td>84571</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20401</th>
      <td>82976</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20402</th>
      <td>916800</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20403</th>
      <td>81939</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20404</th>
      <td>85520</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20405</th>
      <td>953430</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20406</th>
      <td>962077</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20407</th>
      <td>922155</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20408</th>
      <td>922220</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20409</th>
      <td>922272</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20410</th>
      <td>J966334</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20411</th>
      <td>971127</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20412</th>
      <td>953874</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20413</th>
      <td>971222</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20414</th>
      <td>88253</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20415</th>
      <td>988158</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20416</th>
      <td>939990</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20417</th>
      <td>988169</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20418</th>
      <td>988001</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20419</th>
      <td>89793</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20420</th>
      <td>961717</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20421</th>
      <td>974332</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20422</th>
      <td>961550</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>20478</th>
      <td>919146</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20479</th>
      <td>88893</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20480</th>
      <td>A810695</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20481</th>
      <td>978044</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20482</th>
      <td>88881</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20483</th>
      <td>961735</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20484</th>
      <td>975347</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20485</th>
      <td>88987</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20486</th>
      <td>A910397</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20487</th>
      <td>971483</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20488</th>
      <td>A920302</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20489</th>
      <td>8A0874</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20490</th>
      <td>8Z0027</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20491</th>
      <td>8Z0023</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20492</th>
      <td>961780</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20493</th>
      <td>975326</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20494</th>
      <td>974699</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20495</th>
      <td>974711</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20496</th>
      <td>974712</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20497</th>
      <td>8Z0014</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20498</th>
      <td>974702</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20499</th>
      <td>971498</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20500</th>
      <td>A836893</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20501</th>
      <td>A800547</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20502</th>
      <td>21210014</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20503</th>
      <td>961541</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20504</th>
      <td>961789</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20505</th>
      <td>961782</td>
      <td>TUTUTU</td>
      <td>Y</td>
    </tr>
    <tr>
      <th>20506</th>
      <td>96128754</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20507</th>
      <td>TUTUTU</td>
      <td>A875001</td>
      <td>N</td>
    </tr>
  </tbody>
</table>
<p>115 rows × 3 columns</p>
</div>
df_lizhi[df_lizhi.IN_LIZHI=='N']
<div>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>EMP_NO1</th>
      <th>EMP_NO2</th>
      <th>IN_LIZHI</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>10146</th>
      <td>94527750</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>17858</th>
      <td>94506706</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20395</th>
      <td>A920314</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20424</th>
      <td>A920146</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20425</th>
      <td>91026733</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20426</th>
      <td>A920096</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20432</th>
      <td>A970202</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20445</th>
      <td>50226793</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20449</th>
      <td>A920031</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20452</th>
      <td>A350225</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20453</th>
      <td>A300117</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20465</th>
      <td>A920089</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20466</th>
      <td>A920128</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20480</th>
      <td>A810695</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20486</th>
      <td>A910397</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20488</th>
      <td>A920302</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20500</th>
      <td>A836893</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20501</th>
      <td>A800547</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20506</th>
      <td>96128754</td>
      <td>TUTUTU</td>
      <td>N</td>
    </tr>
    <tr>
      <th>20507</th>
      <td>TUTUTU</td>
      <td>A875001</td>
      <td>N</td>
    </tr>
  </tbody>
</table>
</div>

展开阅读全文
打赏
0
0 收藏
分享
加载中
更多评论
打赏
0 评论
0 收藏
0
分享
返回顶部
顶部