Webページ上のすべての画像のURLのリストを取得するために、次のコードを使用しています

-1

Google画像検索から画像のURLを取得したいと考えています。目的は、Googleの画像検索結果に含まれるすべての画像のURLを取得することです。問題は、URLが20件しか取得できないこと、そしてそのURLが指しているのが実寸の画像ではなく小さな画像（サムネイル）であることです。Webページ上のすべての画像のURLのリストを取得するために、次のコードを使用しています。

コードは次のとおりです：

/// <summary>
/// Downloads the page at <paramref name="Url"/> and collects the address of every
/// image referenced by the <c>src</c> attribute of an <c>&lt;img&gt;</c> tag.
/// </summary>
/// <remarks>
/// Only the markup returned by the server is inspected. Images that a page adds
/// later through script are not part of that markup and cannot be found here.
/// </remarks>
/// <param name="Url">
/// Address of the page. <c>http://</c> is prepended when the value does not already
/// start with an http or https scheme.
/// </param>
/// <returns>
/// Image addresses in document order. Relative addresses are resolved against the
/// address reported by the response. The list is empty when the download fails or
/// returns no data.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="Url"/> is null.</exception>
public List<string> FetchImages(string Url)
{
    if (Url == null)
    {
        throw new ArgumentNullException("Url");
    }

    List<string> imageList = new List<string>();

    //Append http:// if necessary
    if (!HasHttpScheme(Url))
    {
        Url = "http://" + Url;
    }

    string responseUrl;
    byte[] rawPage = DownloadData(Url, out responseUrl);

    // Resolve relative images against the address that actually answered the request.
    if (!string.IsNullOrEmpty(responseUrl))
    {
        Url = responseUrl;
    }

    if (rawPage == null || rawPage.Length == 0)
    {
        return imageList;
    }

    // UTF-8 is a superset of ASCII, so plain-ASCII pages decode exactly as before
    // while non-ASCII characters are no longer replaced by '?'.
    string htmlData = Encoding.UTF8.GetString(rawPage);

    // Left null when Url cannot be parsed; sources are then returned as written.
    Uri pageUri;
    Uri.TryCreate(Url, UriKind.Absolute, out pageUri);

    foreach (string source in ExtractImageSources(htmlData))
    {
        string img = source;

        // Absolute http(s) addresses are kept verbatim. Everything else is resolved by
        // System.Uri, which handles root-relative ("/a.png"), page-relative ("a.png")
        // and protocol-relative ("//host/a.png") forms and leaves other absolute
        // schemes such as "data:" untouched.
        Uri resolved;
        if (!HasHttpScheme(img) && pageUri != null && Uri.TryCreate(pageUri, img, out resolved))
        {
            img = resolved.AbsoluteUri;
        }

        imageList.Add(img);
    }

    return imageList;
}

/// <summary>Tells whether <paramref name="value"/> starts with "http://" or "https://", ignoring case.</summary>
private static bool HasHttpScheme(string value)
{
    return value.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || value.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
}

/// <summary>
/// Returns the non-empty, HTML-decoded <c>src</c> values of all <c>&lt;img&gt;</c> tags
/// in <paramref name="html"/>, in document order.
/// </summary>
private static List<string> ExtractImageSources(string html)
{
    // "<img", then (without leaving the tag) a whitespace-preceded src attribute whose
    // value is double-quoted, single-quoted or unquoted. Requiring whitespace before
    // "src" keeps attributes such as data-src from being mistaken for it.
    // Limitation: a '>' inside an earlier attribute value ends the tag prematurely.
    const string imgSrcPattern =
        @"<img\b[^>]*?\ssrc\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s""'>]+))";

    List<string> sources = new List<string>();

    System.Text.RegularExpressions.MatchCollection matches =
        System.Text.RegularExpressions.Regex.Matches(
            html,
            imgSrcPattern,
            System.Text.RegularExpressions.RegexOptions.IgnoreCase
                | System.Text.RegularExpressions.RegexOptions.CultureInvariant);

    foreach (System.Text.RegularExpressions.Match match in matches)
    {
        // Attribute values are HTML-encoded in markup (for example "&amp;" for "&").
        string value = System.Net.WebUtility.HtmlDecode(match.Groups["url"].Value).Trim();

        if (value.Length > 0)
        {
            sources.Add(value);
        }
    }

    return sources;
}
/// <summary>
/// Returns the part of <paramref name="Url"/> that precedes the first '/' found after
/// the "://" scheme separator, for example "http://example.com" for
/// "http://example.com/a/b.png".
/// </summary>
/// <param name="Url">
/// Address to shorten. When it contains no "://" the search for '/' starts at the
/// beginning of the string.
/// </param>
/// <returns>
/// The prefix without a trailing slash, or an empty string when <paramref name="Url"/>
/// is null or empty or contains no '/' after the scheme separator.
/// </returns>
private string GetBaseURL(string Url)
{
    const string schemeSeparator = "://";

    if (string.IsNullOrEmpty(Url))
    {
        return string.Empty;
    }

    // IndexOf yields -1 when the separator is missing; adding the separator length to
    // that would produce a meaningless (and possibly out-of-range) start offset.
    int separatorIndex = Url.IndexOf(schemeSeparator, StringComparison.Ordinal);
    int hostStart = separatorIndex == -1 ? 0 : separatorIndex + schemeSeparator.Length;

    int end = Url.IndexOf('/', hostStart);

    return end != -1 ? Url.Substring(0, end) : string.Empty;
}
/// <summary>
/// Downloads the resource at <paramref name="Url"/> completely into memory.
/// </summary>
/// <param name="Url">Address to request.</param>
/// <param name="responseUrl">
/// Receives the URI reported by the response, or an empty string when the download fails.
/// </param>
/// <returns>The response body, or an empty array when any error occurs.</returns>
private byte[] DownloadData(string Url, out string responseUrl)
{
    try
    {
        //Get a data stream from the url
        WebRequest req = WebRequest.Create(Url);

        // The using blocks release the response and both streams even when reading
        // fails part-way; previously the response was never closed at all.
        using (WebResponse response = req.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (MemoryStream memStream = new MemoryStream())
        {
            //Download to memory
            //Note: adjust the streams here to download directly to the hard drive
            stream.CopyTo(memStream);

            responseUrl = response.ResponseUri.ToString();

            //Convert the downloaded stream to a byte array
            return memStream.ToArray();
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers treat an empty result as
        // "nothing downloaded", so every failure is reported that way.
        responseUrl = string.Empty;
        return new byte[0];
    }
}

出典

2017-10-01 Tomer Kelman

なぜ20件の画像しか取得できないのかは分かりませんが、小さな画像しか取得できない理由については答えられると思います。実寸の画像は、ページの読み込み時点では実際には存在していないのです。ページ上の要素を調べると、ページ全体がJavaScriptで処理されており、画像とそのURLはユーザーの操作に応じて動的に読み込まれることが分かります。素のページに実際に含まれているのは小さな画像だけです。 – Zorak

よく分かりません。画像はGoogleから取得しているのですか、それとも他のサイトからですか？ –

Googleからです。どなたか、他の画像検索エンジンをご存じありませんか？ –

Googleは一度に一定数の画像だけを読み込んでいるようです。ページをスクロールすると、さらに画像が読み込まれます。まだ中身が読み込まれていない画像に到達した時点で、それが画像の終わりと見なされているのではないでしょうか？

出典

2017-10-02 02:20:27

Webページ上のすべての画像のURLのリストを取得するために、次のコードを使用しています

答えて

関連する問題